aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--fs/xfs/Kconfig25
-rw-r--r--fs/xfs/Makefile13
-rw-r--r--fs/xfs/kmem.c110
-rw-r--r--fs/xfs/kmem.h29
-rw-r--r--fs/xfs/libxfs/xfs_ag.c555
-rw-r--r--fs/xfs/libxfs/xfs_ag.h218
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c70
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h29
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c628
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h125
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c304
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h23
-rw-r--r--fs/xfs/libxfs/xfs_attr.c1962
-rw-r--r--fs/xfs/libxfs/xfs_attr.h646
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c505
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h11
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c240
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h11
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h31
-rw-r--r--fs/xfs/libxfs/xfs_bit.h2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c1284
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h116
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c184
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h16
-rw-r--r--fs/xfs/libxfs/xfs_btree.c803
-rw-r--r--fs/xfs/libxfs/xfs_btree.h255
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c880
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.h123
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c79
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h42
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h49
-rw-r--r--fs/xfs/libxfs/xfs_defer.c640
-rw-r--r--fs/xfs/libxfs/xfs_defer.h91
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c118
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h16
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c65
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c52
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c37
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c52
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c95
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c70
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h18
-rw-r--r--fs/xfs/libxfs/xfs_format.h657
-rw-r--r--fs/xfs/libxfs/xfs_fs.h116
-rw-r--r--fs/xfs/libxfs/xfs_health.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c958
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h77
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c364
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h24
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c642
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h68
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c473
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h217
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h190
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h100
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c77
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h114
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c500
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h71
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c258
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h16
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c423
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h31
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c315
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h16
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c56
-rw-r--r--fs/xfs/libxfs/xfs_sb.c610
-rw-r--r--fs/xfs/libxfs/xfs_sb.h16
-rw-r--r--fs/xfs/libxfs/xfs_shared.h51
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c30
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c143
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c253
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h18
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h15
-rw-r--r--fs/xfs/libxfs/xfs_types.c139
-rw-r--r--fs/xfs/libxfs/xfs_types.h63
-rw-r--r--fs/xfs/scrub/agheader.c222
-rw-r--r--fs/xfs/scrub/agheader_repair.c208
-rw-r--r--fs/xfs/scrub/alloc.c15
-rw-r--r--fs/xfs/scrub/attr.c41
-rw-r--r--fs/xfs/scrub/attr.h5
-rw-r--r--fs/xfs/scrub/bitmap.c117
-rw-r--r--fs/xfs/scrub/bitmap.h23
-rw-r--r--fs/xfs/scrub/bmap.c214
-rw-r--r--fs/xfs/scrub/btree.c203
-rw-r--r--fs/xfs/scrub/btree.h21
-rw-r--r--fs/xfs/scrub/common.c156
-rw-r--r--fs/xfs/scrub/common.h76
-rw-r--r--fs/xfs/scrub/dabtree.c128
-rw-r--r--fs/xfs/scrub/dir.c115
-rw-r--r--fs/xfs/scrub/fscounters.c83
-rw-r--r--fs/xfs/scrub/health.c7
-rw-r--r--fs/xfs/scrub/ialloc.c31
-rw-r--r--fs/xfs/scrub/inode.c101
-rw-r--r--fs/xfs/scrub/parent.c23
-rw-r--r--fs/xfs/scrub/quota.c104
-rw-r--r--fs/xfs/scrub/refcount.c95
-rw-r--r--fs/xfs/scrub/repair.c145
-rw-r--r--fs/xfs/scrub/repair.h19
-rw-r--r--fs/xfs/scrub/rmap.c14
-rw-r--r--fs/xfs/scrub/rtbitmap.c65
-rw-r--r--fs/xfs/scrub/scrub.c127
-rw-r--r--fs/xfs/scrub/scrub.h18
-rw-r--r--fs/xfs/scrub/symlink.c17
-rw-r--r--fs/xfs/scrub/trace.c16
-rw-r--r--fs/xfs/scrub/trace.h92
-rw-r--r--fs/xfs/scrub/xfs_scrub.h4
-rw-r--r--fs/xfs/xfs_acl.c183
-rw-r--r--fs/xfs/xfs_acl.h19
-rw-r--r--fs/xfs/xfs_aops.c255
-rw-r--r--fs/xfs/xfs_attr_inactive.c34
-rw-r--r--fs/xfs/xfs_attr_item.c881
-rw-r--r--fs/xfs/xfs_attr_item.h54
-rw-r--r--fs/xfs/xfs_attr_list.c189
-rw-r--r--fs/xfs/xfs_bio_io.c14
-rw-r--r--fs/xfs/xfs_bmap_item.c465
-rw-r--r--fs/xfs/xfs_bmap_item.h17
-rw-r--r--fs/xfs/xfs_bmap_util.c674
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c1012
-rw-r--r--fs/xfs/xfs_buf.h164
-rw-r--r--fs/xfs/xfs_buf_item.c639
-rw-r--r--fs/xfs/xfs_buf_item.h48
-rw-r--r--fs/xfs/xfs_buf_item_recover.c1061
-rw-r--r--fs/xfs/xfs_dir2_readdir.c85
-rw-r--r--fs/xfs/xfs_discard.c25
-rw-r--r--fs/xfs/xfs_dquot.c696
-rw-r--r--fs/xfs/xfs_dquot.h134
-rw-r--r--fs/xfs/xfs_dquot_item.c154
-rw-r--r--fs/xfs/xfs_dquot_item.h16
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c201
-rw-r--r--fs/xfs/xfs_error.c46
-rw-r--r--fs/xfs/xfs_error.h34
-rw-r--r--fs/xfs/xfs_export.c24
-rw-r--r--fs/xfs/xfs_extent_busy.c51
-rw-r--r--fs/xfs/xfs_extent_busy.h10
-rw-r--r--fs/xfs/xfs_extfree_item.c422
-rw-r--r--fs/xfs/xfs_extfree_item.h47
-rw-r--r--fs/xfs/xfs_file.c756
-rw-r--r--fs/xfs/xfs_filestream.c49
-rw-r--r--fs/xfs/xfs_filestream.h4
-rw-r--r--fs/xfs/xfs_fsmap.c210
-rw-r--r--fs/xfs/xfs_fsmap.h6
-rw-r--r--fs/xfs/xfs_fsops.c372
-rw-r--r--fs/xfs/xfs_fsops.h4
-rw-r--r--fs/xfs/xfs_globals.c8
-rw-r--r--fs/xfs/xfs_health.c17
-rw-r--r--fs/xfs/xfs_icache.c2480
-rw-r--r--fs/xfs/xfs_icache.h121
-rw-r--r--fs/xfs/xfs_icreate_item.c159
-rw-r--r--fs/xfs/xfs_icreate_item.h2
-rw-r--r--fs/xfs/xfs_inode.c2865
-rw-r--r--fs/xfs/xfs_inode.h248
-rw-r--r--fs/xfs/xfs_inode_item.c690
-rw-r--r--fs/xfs/xfs_inode_item.h32
-rw-r--r--fs/xfs/xfs_inode_item_recover.c550
-rw-r--r--fs/xfs/xfs_ioctl.c1340
-rw-r--r--fs/xfs/xfs_ioctl.h49
-rw-r--r--fs/xfs/xfs_ioctl32.c149
-rw-r--r--fs/xfs/xfs_ioctl32.h24
-rw-r--r--fs/xfs/xfs_iomap.c439
-rw-r--r--fs/xfs/xfs_iomap.h13
-rw-r--r--fs/xfs/xfs_iops.c572
-rw-r--r--fs/xfs/xfs_iops.h13
-rw-r--r--fs/xfs/xfs_itable.c105
-rw-r--r--fs/xfs/xfs_itable.h6
-rw-r--r--fs/xfs/xfs_iunlink_item.c180
-rw-r--r--fs/xfs/xfs_iunlink_item.h27
-rw-r--r--fs/xfs/xfs_iwalk.c143
-rw-r--r--fs/xfs/xfs_iwalk.h2
-rw-r--r--fs/xfs/xfs_linux.h46
-rw-r--r--fs/xfs/xfs_log.c2705
-rw-r--r--fs/xfs/xfs_log.h123
-rw-r--r--fs/xfs/xfs_log_cil.c1338
-rw-r--r--fs/xfs/xfs_log_priv.h325
-rw-r--r--fs/xfs/xfs_log_recover.c3338
-rw-r--r--fs/xfs/xfs_message.c80
-rw-r--r--fs/xfs/xfs_message.h80
-rw-r--r--fs/xfs/xfs_mount.c729
-rw-r--r--fs/xfs/xfs_mount.h577
-rw-r--r--fs/xfs/xfs_mru_cache.c4
-rw-r--r--fs/xfs/xfs_notify_failure.c234
-rw-r--r--fs/xfs/xfs_ondisk.h63
-rw-r--r--fs/xfs/xfs_pnfs.c60
-rw-r--r--fs/xfs/xfs_pwork.c25
-rw-r--r--fs/xfs/xfs_pwork.h4
-rw-r--r--fs/xfs/xfs_qm.c551
-rw-r--r--fs/xfs/xfs_qm.h143
-rw-r--r--fs/xfs/xfs_qm_bhv.c24
-rw-r--r--fs/xfs/xfs_qm_syscalls.c569
-rw-r--r--fs/xfs/xfs_quota.h66
-rw-r--r--fs/xfs/xfs_quotaops.c78
-rw-r--r--fs/xfs/xfs_refcount_item.c397
-rw-r--r--fs/xfs/xfs_refcount_item.h17
-rw-r--r--fs/xfs/xfs_reflink.c839
-rw-r--r--fs/xfs/xfs_reflink.h8
-rw-r--r--fs/xfs/xfs_rmap_item.c427
-rw-r--r--fs/xfs/xfs_rmap_item.h19
-rw-r--r--fs/xfs/xfs_rtalloc.c173
-rw-r--r--fs/xfs/xfs_rtalloc.h26
-rw-r--r--fs/xfs/xfs_stats.c16
-rw-r--r--fs/xfs/xfs_stats.h1
-rw-r--r--fs/xfs/xfs_super.c1294
-rw-r--r--fs/xfs/xfs_super.h10
-rw-r--r--fs/xfs/xfs_symlink.c112
-rw-r--r--fs/xfs/xfs_symlink.h5
-rw-r--r--fs/xfs/xfs_sysctl.c38
-rw-r--r--fs/xfs/xfs_sysctl.h4
-rw-r--r--fs/xfs/xfs_sysfs.c65
-rw-r--r--fs/xfs/xfs_sysfs.h11
-rw-r--r--fs/xfs/xfs_trace.c7
-rw-r--r--fs/xfs/xfs_trace.h1148
-rw-r--r--fs/xfs/xfs_trans.c803
-rw-r--r--fs/xfs/xfs_trans.h137
-rw-r--r--fs/xfs/xfs_trans_ail.c257
-rw-r--r--fs/xfs/xfs_trans_buf.c87
-rw-r--r--fs/xfs/xfs_trans_dquot.c562
-rw-r--r--fs/xfs/xfs_trans_priv.h29
-rw-r--r--fs/xfs/xfs_xattr.c192
-rw-r--r--fs/xfs/xfs_xattr.h13
222 files changed, 33264 insertions, 24132 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index e685299eb3d2..9fac5ea8d0e4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,31 @@ config XFS_FS
system of your root partition is compiled as a module, you'll need
to use an initial ramdisk (initrd) to boot.
+config XFS_SUPPORT_V4
+ bool "Support deprecated V4 (crc=0) format"
+ depends on XFS_FS
+ default y
+ help
+ The V4 filesystem format lacks certain features that are supported
+ by the V5 format, such as metadata checksumming, strengthened
+ metadata verification, and the ability to store timestamps past the
+ year 2038. Because of this, the V4 format is deprecated. All users
+ should upgrade by backing up their files, reformatting, and restoring
+ from the backup.
+
+ Administrators and users can detect a V4 filesystem by running
+ xfs_info against a filesystem mountpoint and checking for a string
+ beginning with "crc=". If the string "crc=0" is found, the
+ filesystem is a V4 filesystem. If no such string is found, please
+ upgrade xfsprogs to the latest version and try again.
+
+ This option will become default N in September 2025. Support for the
+ V4 format will be removed entirely in September 2030. Distributors
+ can say N here to withdraw support earlier.
+
+ To continue supporting the old V4 format (crc=0), say Y.
+ To close off an attack surface, say N.
+
config XFS_QUOTA
bool "XFS Quota support"
depends on XFS_FS
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index aceca2f9a3db..03135a1c31b6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -7,8 +7,6 @@
ccflags-y += -I $(srctree)/$(src) # needed for trace events
ccflags-y += -I $(srctree)/$(src)/libxfs
-ccflags-$(CONFIG_XFS_DEBUG) += -g
-
obj-$(CONFIG_XFS_FS) += xfs.o
# this one should be compiled first, as the tracing macros can easily blow up
@@ -26,6 +24,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
+ xfs_btree_staging.o \
xfs_da_btree.o \
xfs_defer.o \
xfs_dir2.o \
@@ -100,9 +99,14 @@ xfs-y += xfs_log.o \
xfs_log_cil.o \
xfs_bmap_item.o \
xfs_buf_item.o \
+ xfs_buf_item_recover.o \
+ xfs_dquot_item_recover.o \
xfs_extfree_item.o \
+ xfs_attr_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
+ xfs_inode_item_recover.o \
+ xfs_iunlink_item.o \
xfs_refcount_item.o \
xfs_rmap_item.o \
xfs_log_recover.o \
@@ -126,6 +130,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+# notify failure
+ifeq ($(CONFIG_MEMORY_FAILURE),y)
+xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
+endif
+
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 1da94237a8cf..c557a030acfe 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/backing-dev.h>
#include "xfs_message.h"
#include "xfs_trace.h"
@@ -26,113 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
current->comm, current->pid,
(unsigned int)size, __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- } while (1);
-}
-
-
-/*
- * __vmalloc() will allocate data pages and auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
- * we need to tell memory reclaim that we are in such a context via
- * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
- * and potentially deadlocking.
- */
-static void *
-__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
-{
- unsigned nofs_flag = 0;
- void *ptr;
- gfp_t lflags = kmem_flags_convert(flags);
-
- if (flags & KM_NOFS)
- nofs_flag = memalloc_nofs_save();
-
- ptr = __vmalloc(size, lflags, PAGE_KERNEL);
-
- if (flags & KM_NOFS)
- memalloc_nofs_restore(nofs_flag);
-
- return ptr;
-}
-
-/*
- * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
- * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
- * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
- * aligned region.
- */
-void *
-kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
-{
- void *ptr;
-
- trace_kmem_alloc_io(size, flags, _RET_IP_);
-
- if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
- align_mask = PAGE_SIZE - 1;
-
- ptr = kmem_alloc(size, flags | KM_MAYFAIL);
- if (ptr) {
- if (!((uintptr_t)ptr & align_mask))
- return ptr;
- kfree(ptr);
- }
- return __kmem_vmalloc(size, flags);
-}
-
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
-{
- void *ptr;
-
- trace_kmem_alloc_large(size, flags, _RET_IP_);
-
- ptr = kmem_alloc(size, flags | KM_MAYFAIL);
- if (ptr)
- return ptr;
- return __kmem_vmalloc(size, flags);
-}
-
-void *
-kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_realloc(newsize, flags, _RET_IP_);
-
- do {
- ptr = krealloc(old, newsize, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
- current->comm, current->pid,
- newsize, __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- } while (1);
-}
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
- do {
- ptr = kmem_cache_alloc(zone, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
- current->comm, current->pid,
- __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ memalloc_retry_wait(lflags);
} while (1);
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 6143117770e9..b987dc2c6851 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -19,6 +19,7 @@ typedef unsigned __bitwise xfs_km_flags_t;
#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
+#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u)
/*
* We use a special process flag to avoid recursive callbacks into
@@ -30,7 +31,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
{
gfp_t lflags;
- BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
+ BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
lflags = GFP_KERNEL | __GFP_NOWARN;
if (flags & KM_NOFS)
@@ -49,13 +50,13 @@ kmem_flags_convert(xfs_km_flags_t flags)
if (flags & KM_ZERO)
lflags |= __GFP_ZERO;
+ if (flags & KM_NOLOCKDEP)
+ lflags |= __GFP_NOLOCKDEP;
+
return lflags;
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
-extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
-extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
@@ -68,27 +69,9 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
return kmem_alloc(size, flags | KM_ZERO);
}
-static inline void *
-kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc_large(size, flags | KM_ZERO);
-}
-
/*
* Zone interfaces
*/
-
-#define kmem_zone kmem_cache
-#define kmem_zone_t struct kmem_cache
-
-extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-
-static inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
- return kmem_zone_alloc(zone, flags | KM_ZERO);
-}
-
static inline struct page *
kmem_to_page(void *addr)
{
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 08d6beb54f8c..bb0c700afe3c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -22,6 +22,343 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+
+
+/*
+ * Passive reference counting access wrappers to the perag structures. If the
+ * per-ag structure is to be freed, the freeing code is responsible for cleaning
+ * up objects with passive references before freeing the structure. This is
+ * things like cached buffers.
+ */
+struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ref = 0;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ ASSERT(atomic_read(&pag->pag_ref) >= 0);
+ ref = atomic_inc_return(&pag->pag_ref);
+ }
+ rcu_read_unlock();
+ trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+ return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ unsigned int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
+void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ int ref;
+
+ ASSERT(atomic_read(&pag->pag_ref) > 0);
+ ref = atomic_dec_return(&pag->pag_ref);
+ trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ struct xfs_perag *pag;
+ struct xfs_sb *sbp = &mp->m_sb;
+ uint64_t ifree = 0;
+ uint64_t ialloc = 0;
+ uint64_t bfree = 0;
+ uint64_t bfreelst = 0;
+ uint64_t btree = 0;
+ uint64_t fdblocks;
+ int error = 0;
+
+ for (index = 0; index < agcount; index++) {
+ /*
+ * Read the AGF and AGI buffers to populate the per-ag
+ * structures for us.
+ */
+ pag = xfs_perag_get(mp, index);
+ error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
+ if (!error)
+ error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ if (error) {
+ xfs_perag_put(pag);
+ return error;
+ }
+
+ ifree += pag->pagi_freecount;
+ ialloc += pag->pagi_count;
+ bfree += pag->pagf_freeblks;
+ bfreelst += pag->pagf_flcount;
+ btree += pag->pagf_btreeblks;
+ xfs_perag_put(pag);
+ }
+ fdblocks = bfree + bfreelst + btree;
+
+ /*
+ * If the new summary counts are obviously incorrect, fail the
+ * mount operation because that implies the AGFs are also corrupt.
+ * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
+ * will prevent xfs_repair from fixing anything.
+ */
+ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
+ xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Overwrite incore superblock counters with just-read data */
+ spin_lock(&mp->m_sb_lock);
+ sbp->sb_ifree = ifree;
+ sbp->sb_icount = ialloc;
+ sbp->sb_fdblocks = fdblocks;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_reinit_percpu_counters(mp);
+out:
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ return error;
+}
+
+STATIC void
+__xfs_free_perag(
+ struct rcu_head *head)
+{
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+ kmem_free(pag);
+}
+
+/*
+ * Free up the per-ag resources associated with the mount structure.
+ */
+void
+xfs_free_perag(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, agno);
+ spin_unlock(&mp->m_perag_lock);
+ ASSERT(pag);
+ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ xfs_buf_hash_destroy(pag);
+
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
+ }
+}
+
+/* Find the size of the AG, in blocks. */
+static xfs_agblock_t
+__xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
+{
+ ASSERT(agno < agcount);
+
+ if (agno < agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+}
+
+/* Calculate the first and last possible inode number in an AG. */
+static void
+__xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agblock_t eoag,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ xfs_agblock_t bno;
+
+ /*
+ * Calculate the first inode, which will be in the first
+ * cluster-aligned block after the AGFL.
+ */
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
+
+ /*
+ * Calculate the last inode, which will be at the end of the
+ * last (aligned) cluster that can be allocated in the AG.
+ */
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+}
+
+void
+xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
+}
+
+int
+xfs_initialize_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks,
+ xfs_agnumber_t *maxagi)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+ xfs_agnumber_t first_initialised = NULLAGNUMBER;
+ int error;
+
+ /*
+ * Walk the current per-ag tree so we don't try to initialise AGs
+ * that already exist (growfs case). Allocate and insert all the
+ * AGs we don't find ready for initialisation.
+ */
+ for (index = 0; index < agcount; index++) {
+ pag = xfs_perag_get(mp, index);
+ if (pag) {
+ xfs_perag_put(pag);
+ continue;
+ }
+
+ pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ if (!pag) {
+ error = -ENOMEM;
+ goto out_unwind_new_pags;
+ }
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+
+ error = radix_tree_preload(GFP_NOFS);
+ if (error)
+ goto out_free_pag;
+
+ spin_lock(&mp->m_perag_lock);
+ if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+ WARN_ON_ONCE(1);
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+ error = -EEXIST;
+ goto out_free_pag;
+ }
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+
+#ifdef __KERNEL__
+ /* Place kernel structure only init below this point. */
+ spin_lock_init(&pag->pag_ici_lock);
+ spin_lock_init(&pag->pagb_lock);
+ spin_lock_init(&pag->pag_state_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ init_waitqueue_head(&pag->pagb_wait);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
+#endif /* __KERNEL__ */
+
+ error = xfs_buf_hash_init(pag);
+ if (error)
+ goto out_remove_pag;
+
+ /* first new pag is fully initialized */
+ if (first_initialised == NULLAGNUMBER)
+ first_initialised = index;
+
+ /*
+ * Pre-calculated geometry
+ */
+ pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ dblocks);
+ pag->min_block = XFS_AGFL_BLOCK(mp);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ }
+
+ index = xfs_set_inode_alloc(mp, agcount);
+
+ if (maxagi)
+ *maxagi = index;
+
+ mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
+ return 0;
+
+out_remove_pag:
+ radix_tree_delete(&mp->m_perag_tree, index);
+out_free_pag:
+ kmem_free(pag);
+out_unwind_new_pags:
+ /* unwind any prior newly initialized pags */
+ for (index = first_initialised; index < agcount; index++) {
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ kmem_free(pag);
+ }
+ return error;
+}
static int
xfs_get_aghdr_buf(
@@ -38,8 +375,6 @@ xfs_get_aghdr_buf(
if (error)
return error;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- bp->b_bn = blkno;
bp->b_maps[0].bm_bn = blkno;
bp->b_ops = ops;
@@ -47,12 +382,6 @@ xfs_get_aghdr_buf(
return 0;
}
-static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
-{
- return mp->m_sb.sb_logstart > 0 &&
- id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
-}
-
/*
* Generic btree root block init function
*/
@@ -78,7 +407,7 @@ xfs_freesp_init_recs(
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
struct xfs_alloc_rec *nrec;
xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
mp->m_sb.sb_logstart);
@@ -195,7 +524,7 @@ xfs_rmaproot_init(
rrec->rm_offset = 0;
/* account for refc btree root */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
rrec = XFS_RMAP_REC_ADDR(block, 5);
rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
rrec->rm_blockcount = cpu_to_be32(1);
@@ -205,7 +534,7 @@ xfs_rmaproot_init(
}
/* account for the log space */
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
rrec = XFS_RMAP_REC_ADDR(block,
be16_to_cpu(block->bb_numrecs) + 1);
rrec->rm_startblock = cpu_to_be32(
@@ -231,7 +560,7 @@ xfs_sbblock_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
xfs_sb_to_disk(dsb, &mp->m_sb);
dsb->sb_inprogress = 1;
@@ -243,7 +572,7 @@ xfs_agfblock_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_agf *agf = bp->b_addr;
xfs_extlen_t tmpsize;
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -254,7 +583,7 @@ xfs_agfblock_init(
agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
agf->agf_roots[XFS_BTNUM_RMAPi] =
cpu_to_be32(XFS_RMAP_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
@@ -267,16 +596,16 @@ xfs_agfblock_init(
tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
agf->agf_refcount_root = cpu_to_be32(
xfs_refc_block(mp));
agf->agf_refcount_level = cpu_to_be32(1);
agf->agf_refcount_blocks = cpu_to_be32(1);
}
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
int64_t logblocks = mp->m_sb.sb_logblocks;
be32_add_cpu(&agf->agf_freeblks, -logblocks);
@@ -295,13 +624,13 @@ xfs_agflblock_init(
__be32 *agfl_bno;
int bucket;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
agfl->agfl_seqno = cpu_to_be32(id->agno);
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
}
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+ agfl_bno = xfs_buf_to_agfl_bno(bp);
for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
}
@@ -312,7 +641,7 @@ xfs_agiblock_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ struct xfs_agi *agi = bp->b_addr;
int bucket;
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -325,14 +654,19 @@ xfs_agiblock_init(
agi->agi_freecount = 0;
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (xfs_has_finobt(mp)) {
agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
agi->agi_free_level = cpu_to_be32(1);
}
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ if (xfs_has_inobtcounts(mp)) {
+ agi->agi_iblocks = cpu_to_be32(1);
+ if (xfs_has_finobt(mp))
+ agi->agi_fblocks = cpu_to_be32(1);
+ }
}
typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
@@ -440,14 +774,14 @@ xfs_ag_init_headers(
.ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
- .need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
+ .need_init = xfs_has_finobt(mp)
},
{ /* RMAP root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
.ops = &xfs_rmapbt_buf_ops,
.work = &xfs_rmaproot_init,
- .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
+ .need_init = xfs_has_rmapbt(mp)
},
{ /* REFC root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
@@ -455,7 +789,7 @@ xfs_ag_init_headers(
.ops = &xfs_refcountbt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_REFC,
- .need_init = xfs_sb_version_hasreflink(&mp->m_sb)
+ .need_init = xfs_has_reflink(mp)
},
{ /* NULL terminating block */
.daddr = XFS_BUF_DADDR_NULL,
@@ -480,14 +814,134 @@ xfs_ag_init_headers(
return error;
}
+int
+xfs_ag_shrink_space(
+ struct xfs_perag *pag,
+ struct xfs_trans **tpp,
+ xfs_extlen_t delta)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_alloc_arg args = {
+ .tp = *tpp,
+ .mp = mp,
+ .type = XFS_ALLOCTYPE_THIS_BNO,
+ .minlen = delta,
+ .maxlen = delta,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .resv = XFS_AG_RESV_NONE,
+ .prod = 1
+ };
+ struct xfs_buf *agibp, *agfbp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ xfs_agblock_t aglen;
+ int error, err2;
+
+ ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
+ if (error)
+ return error;
+
+ agi = agibp->b_addr;
+
+ error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp);
+ if (error)
+ return error;
+
+ agf = agfbp->b_addr;
+ aglen = be32_to_cpu(agi->agi_length);
+ /* some extra paranoid checks before we shrink the ag */
+ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+ return -EFSCORRUPTED;
+ if (delta >= aglen)
+ return -EINVAL;
+
+ args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
+
+ /*
+ * Make sure that the last inode cluster cannot overlap with the new
+ * end of the AG, even if it's sparse.
+ */
+ error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
+ aglen - delta);
+ if (error)
+ return error;
+
+ /*
+ * Disable perag reservations so it doesn't cause the allocation request
+ * to fail. We'll reestablish reservation before we return.
+ */
+ error = xfs_ag_resv_free(pag);
+ if (error)
+ return error;
+
+ /* internal log shouldn't also show up in the free space btrees */
+ error = xfs_alloc_vextent(&args);
+ if (!error && args.agbno == NULLAGBLOCK)
+ error = -ENOSPC;
+
+ if (error) {
+ /*
+ * if extent allocation fails, need to roll the transaction to
+ * ensure that the AGFL fixup has been committed anyway.
+ */
+ xfs_trans_bhold(*tpp, agfbp);
+ err2 = xfs_trans_roll(tpp);
+ if (err2)
+ return err2;
+ xfs_trans_bjoin(*tpp, agfbp);
+ goto resv_init_out;
+ }
+
+ /*
+ * if successfully deleted from freespace btrees, need to confirm
+ * per-AG reservation works as expected.
+ */
+ be32_add_cpu(&agi->agi_length, -delta);
+ be32_add_cpu(&agf->agf_length, -delta);
+
+ err2 = xfs_ag_resv_init(pag, *tpp);
+ if (err2) {
+ be32_add_cpu(&agi->agi_length, delta);
+ be32_add_cpu(&agf->agf_length, delta);
+ if (err2 != -ENOSPC)
+ goto resv_err;
+
+ __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+
+ /*
+ * Roll the transaction before trying to re-init the per-ag
+ * reservation. The new transaction is clean so it will cancel
+ * without any side effects.
+ */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ return error;
+
+ error = -ENOSPC;
+ goto resv_init_out;
+ }
+ xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
+ xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
+ return 0;
+
+resv_init_out:
+ err2 = xfs_ag_resv_init(pag, *tpp);
+ if (!err2)
+ return error;
+resv_err:
+ xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return err2;
+}
+
/*
* Extent the AG indicated by the @id by the length passed in
*/
int
xfs_ag_extend_space(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct aghdr_init_data *id,
xfs_extlen_t len)
{
struct xfs_buf *bp;
@@ -495,27 +949,24 @@ xfs_ag_extend_space(
struct xfs_agf *agf;
int error;
- /*
- * Change the agi length.
- */
- error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+ ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+
+ error = xfs_ialloc_read_agi(pag, tp, &bp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(bp);
+ agi = bp->b_addr;
be32_add_cpu(&agi->agi_length, len);
- ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
- be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
/*
* Change agf length.
*/
- error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &bp);
if (error)
return error;
- agf = XFS_BUF_TO_AGF(bp);
+ agf = bp->b_addr;
be32_add_cpu(&agf->agf_length, len);
ASSERT(agf->agf_length == agi->agi_length);
xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
@@ -526,54 +977,55 @@ xfs_ag_extend_space(
* XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
* this doesn't actually exist in the rmap btree.
*/
- error = xfs_rmap_free(tp, bp, id->agno,
- be32_to_cpu(agf->agf_length) - len,
+ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len,
len, &XFS_RMAP_OINFO_SKIP_UPDATE);
if (error)
return error;
- return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+ error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
be32_to_cpu(agf->agf_length) - len),
len, &XFS_RMAP_OINFO_SKIP_UPDATE,
XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /* Update perag geometry */
+ pag->block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ return 0;
}
/* Retrieve AG geometry. */
int
xfs_ag_get_geometry(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_ag_geometry *ageo)
{
struct xfs_buf *agi_bp;
struct xfs_buf *agf_bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
- struct xfs_perag *pag;
unsigned int freeblks;
int error;
- if (agno >= mp->m_sb.sb_agcount)
- return -EINVAL;
-
/* Lock the AG headers. */
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
if (error)
return error;
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
if (error)
goto out_agi;
- pag = xfs_perag_get(mp, agno);
/* Fill out form. */
memset(ageo, 0, sizeof(*ageo));
- ageo->ag_number = agno;
+ ageo->ag_number = pag->pag_agno;
- agi = XFS_BUF_TO_AGI(agi_bp);
+ agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
- agf = XFS_BUF_TO_AGF(agf_bp);
+ agf = agf_bp->b_addr;
ageo->ag_length = be32_to_cpu(agf->agf_length);
freeblks = pag->pagf_freeblks +
pag->pagf_flcount +
@@ -583,7 +1035,6 @@ xfs_ag_get_geometry(
xfs_ag_geom_health(pag, ageo);
/* Release resources. */
- xfs_perag_put(pag);
xfs_buf_relse(agf_bp);
out_agi:
xfs_buf_relse(agi_bp);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 5166322807e7..191b22b9a35b 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -9,6 +9,215 @@
struct xfs_mount;
struct xfs_trans;
+struct xfs_perag;
+
+/*
+ * Per-ag infrastructure
+ */
+
+/* per-AG block reservation data structures*/
+struct xfs_ag_resv {
+ /* number of blocks originally reserved here */
+ xfs_extlen_t ar_orig_reserved;
+ /* number of blocks reserved here */
+ xfs_extlen_t ar_reserved;
+ /* number of blocks originally asked for */
+ xfs_extlen_t ar_asked;
+};
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection.
+ */
+struct xfs_perag {
+ struct xfs_mount *pag_mount; /* owner filesystem */
+ xfs_agnumber_t pag_agno; /* AG this structure belongs to */
+ atomic_t pag_ref; /* perag reference count */
+ char pagf_init; /* this agf's entry is initialized */
+ char pagi_init; /* this agi's entry is initialized */
+ char pagf_metadata; /* the agf is preferred to be metadata */
+ char pagi_inodeok; /* The agi is ok for inodes */
+ uint8_t pagf_levels[XFS_BTNUM_AGF];
+ /* # of levels in bno & cnt btree */
+ bool pagf_agflreset; /* agfl requires reset before use */
+ uint32_t pagf_flcount; /* count of blocks in freelist */
+ xfs_extlen_t pagf_freeblks; /* total free blocks */
+ xfs_extlen_t pagf_longest; /* longest free space */
+ uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ xfs_agino_t pagi_freecount; /* number of free inodes */
+ xfs_agino_t pagi_count; /* number of allocated inodes */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
+
+ int pagb_count; /* pagb slots in use */
+ uint8_t pagf_refcount_level; /* recount btree height */
+
+ /* Blocks reserved for all kinds of metadata. */
+ struct xfs_ag_resv pag_meta_resv;
+ /* Blocks reserved for the reverse mapping btree. */
+ struct xfs_ag_resv pag_rmapbt_resv;
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+
+ /* Precalculated geometry info */
+ xfs_agblock_t block_count;
+ xfs_agblock_t min_block;
+ xfs_agino_t agino_min;
+ xfs_agino_t agino_max;
+
+#ifdef __KERNEL__
+ /* -- kernel only structures below this line -- */
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold pag_state_lock before accessing this field.
+ */
+ uint16_t pag_checked;
+ uint16_t pag_sick;
+ spinlock_t pag_state_lock;
+
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
+ unsigned int pagb_gen; /* generation count for pagb_tree */
+ wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
+
+ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
+
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
+ struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
+ struct rhashtable pag_buf_hash;
+
+ /* background prealloc block trimming */
+ struct delayed_work pag_blockgc_work;
+
+#endif /* __KERNEL__ */
+};
+
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
+ xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
+int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_free_perag(struct xfs_mount *mp);
+
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+ unsigned int tag);
+void xfs_perag_put(struct xfs_perag *pag);
+
+/*
+ * Per-ag geometry infomation and validation
+ */
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+
+static inline bool
+xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
+{
+ if (agbno >= pag->block_count)
+ return false;
+ if (agbno <= pag->min_block)
+ return false;
+ return true;
+}
+
+static inline bool
+xfs_verify_agbext(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_agblock_t len)
+{
+ if (agbno + len <= agbno)
+ return false;
+
+ if (!xfs_verify_agbno(pag, agbno))
+ return false;
+
+ return xfs_verify_agbno(pag, agbno + len - 1);
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino < pag->agino_min)
+ return false;
+ if (agino > pag->agino_max)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+static inline bool
+xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino == NULLAGINO)
+ return true;
+ return xfs_verify_agino(pag, agino);
+}
+
+static inline bool
+xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
+/*
+ * Perag iteration APIs
+ */
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t end_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_put(pag);
+ if (*agno > end_agno)
+ return NULL;
+ return xfs_perag_get(mp, *agno);
+}
+
+#define for_each_perag_range(mp, agno, end_agno, pag) \
+ for ((pag) = xfs_perag_get((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
+
+#define for_each_perag_from(mp, agno, pag) \
+ for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
+
+
+#define for_each_perag(mp, agno, pag) \
+ (agno) = 0; \
+ for_each_perag_from((mp), (agno), (pag))
+
+#define for_each_perag_tag(mp, agno, pag, tag) \
+ for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ (pag) != NULL; \
+ (agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get_tag((mp), (agno), (tag)))
struct aghdr_init_data {
/* per ag data */
@@ -24,9 +233,10 @@ struct aghdr_init_data {
};
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
-int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
- struct aghdr_init_data *id, xfs_extlen_t len);
-int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_ag_geometry *ageo);
+int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp,
+ xfs_extlen_t delta);
+int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
+ xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fdfe6dc0d307..5af123d13a63 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -19,7 +19,7 @@
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_sb.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
@@ -91,7 +91,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+ return XFS_TEST_ERROR(avail < orig / 10 ||
+ avail < pag->pag_mount->m_agbtree_maxlevels,
pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}
@@ -211,7 +212,11 @@ __xfs_ag_resv_init(
ASSERT(0);
return -EINVAL;
}
- error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
+ error = -ENOSPC;
+ else
+ error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
if (error) {
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
error, _RET_IP_);
@@ -246,20 +251,20 @@ xfs_ag_resv_init(
struct xfs_trans *tp)
{
struct xfs_mount *mp = pag->pag_mount;
- xfs_agnumber_t agno = pag->pag_agno;
xfs_extlen_t ask;
xfs_extlen_t used;
- int error = 0;
+ int error = 0, error2;
+ bool has_resv = false;
/* Create the metadata reservation. */
if (pag->pag_meta_resv.ar_asked == 0) {
ask = used = 0;
- error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used);
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
if (error)
goto out;
- error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used);
+ error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
if (error)
goto out;
@@ -277,7 +282,7 @@ xfs_ag_resv_init(
mp->m_finobt_nores = true;
- error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
+ error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
&used);
if (error)
goto out;
@@ -287,32 +292,55 @@ xfs_ag_resv_init(
if (error)
goto out;
}
+ if (ask)
+ has_resv = true;
}
/* Create the RMAPBT metadata reservation */
if (pag->pag_rmapbt_resv.ar_asked == 0) {
ask = used = 0;
- error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used);
+ error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
if (error)
goto out;
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
if (error)
goto out;
+ if (ask)
+ has_resv = true;
}
-#ifdef DEBUG
- /* need to read in the AGF for the ASSERT below to work */
- error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0);
- if (error)
- return error;
-
- ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
- xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
- pag->pagf_freeblks + pag->pagf_flcount);
-#endif
out:
+ /*
+ * Initialize the pagf if we have at least one active reservation on the
+ * AG. This may have occurred already via reservation calculation, but
+ * fall back to an explicit init to ensure the in-core allocbt usage
+ * counters are initialized as soon as possible. This is important
+ * because filesystems with large perag reservations are susceptible to
+ * free space reservation problems that the allocbt counter is used to
+ * address.
+ */
+ if (has_resv) {
+ error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
+ if (error2)
+ return error2;
+
+ /*
+ * If there isn't enough space in the AG to satisfy the
+ * reservation, let the caller know that there wasn't enough
+ * space. Callers are responsible for deciding what to do
+ * next, since (in theory) we can stumble along with
+ * insufficient reservation if data blocks are being freed to
+ * replenish the AG's free space.
+ */
+ if (!error &&
+ xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+ xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
+ pag->pagf_freeblks + pag->pagf_flcount)
+ error = -ENOSPC;
+ }
+
return error;
}
@@ -338,7 +366,7 @@ xfs_ag_resv_alloc_extent(
break;
default:
ASSERT(0);
- /* fall through */
+ fallthrough;
case XFS_AG_RESV_NONE:
field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
XFS_TRANS_SB_FDBLOCKS;
@@ -380,7 +408,7 @@ xfs_ag_resv_free_extent(
break;
default:
ASSERT(0);
- /* fall through */
+ fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
return;
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index c0352edc8e41..b74b210008ea 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
@@ -18,6 +18,21 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
struct xfs_trans *tp, xfs_extlen_t len);
+static inline struct xfs_ag_resv *
+xfs_perag_resv(
+ struct xfs_perag *pag,
+ enum xfs_ag_resv_type type)
+{
+ switch (type) {
+ case XFS_AG_RESV_METADATA:
+ return &pag->pag_meta_resv;
+ case XFS_AG_RESV_RMAPBT:
+ return &pag->pag_rmapbt_resv;
+ default:
+ return NULL;
+ }
+}
+
/*
* RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
* the AGFL, they are allocated one at a time and the reservation updates don't
@@ -37,16 +52,4 @@ xfs_ag_resv_rmapbt_alloc(
xfs_perag_put(pag);
}
-static inline void
-xfs_ag_resv_rmapbt_free(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, agno);
- xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
- xfs_perag_put(pag);
-}
-
#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d8053bc96c4d..de79f5d07f65 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -10,7 +10,6 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
@@ -24,10 +23,11 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
-extern kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
@@ -51,7 +51,7 @@ xfs_agfl_size(
{
unsigned int size = mp->m_sb.sb_sectsize;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
size -= sizeof(struct xfs_agfl);
return size / sizeof(xfs_agblock_t);
@@ -61,9 +61,9 @@ unsigned int
xfs_refc_block(
struct xfs_mount *mp)
{
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
return XFS_RMAP_BLOCK(mp) + 1;
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
return XFS_FIBT_BLOCK(mp) + 1;
return XFS_IBT_BLOCK(mp) + 1;
}
@@ -72,16 +72,34 @@ xfs_extlen_t
xfs_prealloc_blocks(
struct xfs_mount *mp)
{
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
return xfs_refc_block(mp) + 1;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
return XFS_RMAP_BLOCK(mp) + 1;
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
return XFS_FIBT_BLOCK(mp) + 1;
return XFS_IBT_BLOCK(mp) + 1;
}
/*
+ * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
+ * guarantee that we can refill the AGFL prior to allocating space in a nearly
+ * full AG. Although the space described by the free space btrees, the
+ * blocks used by the freesp btrees themselves, and the blocks owned by the
+ * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
+ * free space in the AG drop so low that the free space btrees cannot refill an
+ * empty AGFL up to the minimum level. Rather than grind through empty AGs
+ * until the fs goes down, we subtract this many AG blocks from the incore
+ * fdblocks to ensure user allocation does not overcommit the space the
+ * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
+ * withhold space from xfs_mod_fdblocks, so we do not account for that here.
+ */
+#define XFS_ALLOCBT_AGFL_RESERVE 4
+
+/*
+ * Compute the number of blocks that we set aside to guarantee the ability to
+ * refill the AGFL and handle a full bmap btree split.
+ *
* In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
* AGF buffer (PV 947395), we place constraints on the relationship among
* actual allocations for data blocks, freelist blocks, and potential file data
@@ -93,14 +111,14 @@ xfs_prealloc_blocks(
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
- * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
- * potential split of the file's bmap btree.
+ * For each AG, we need to reserve enough blocks to replenish a totally empty
+ * AGFL and 4 more to handle a potential split of the file's bmap btree.
*/
unsigned int
xfs_alloc_set_aside(
struct xfs_mount *mp)
{
- return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
+ return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}
/*
@@ -124,13 +142,13 @@ xfs_alloc_ag_max_usable(
unsigned int blocks;
blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
- blocks += XFS_ALLOC_AGFL_RESERVE;
+ blocks += XFS_ALLOCBT_AGFL_RESERVE;
blocks += 3; /* AGF, AGI btree root blocks */
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
blocks++; /* finobt root block */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- blocks++; /* rmap root block */
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
+ blocks++; /* rmap root block */
+ if (xfs_has_reflink(mp))
blocks++; /* refcount root block */
return mp->m_sb.sb_agblocks - blocks;
@@ -151,7 +169,7 @@ xfs_alloc_lookup_eq(
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
- cur->bc_private.a.priv.abt.active = (*stat == 1);
+ cur->bc_ag.abt.active = (*stat == 1);
return error;
}
@@ -171,7 +189,7 @@ xfs_alloc_lookup_ge(
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
- cur->bc_private.a.priv.abt.active = (*stat == 1);
+ cur->bc_ag.abt.active = (*stat == 1);
return error;
}
@@ -190,7 +208,7 @@ xfs_alloc_lookup_le(
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
- cur->bc_private.a.priv.abt.active = (*stat == 1);
+ cur->bc_ag.abt.active = (*stat == 1);
return error;
}
@@ -198,7 +216,7 @@ static inline bool
xfs_alloc_cur_active(
struct xfs_btree_cur *cur)
{
- return cur && cur->bc_private.a.priv.abt.active;
+ return cur && cur->bc_ag.abt.active;
}
/*
@@ -230,7 +248,7 @@ xfs_alloc_get_rec(
int *stat) /* output: success/failure */
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
@@ -245,11 +263,7 @@ xfs_alloc_get_rec(
goto out_bad_rec;
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, *bno))
- goto out_bad_rec;
- if (*bno > *bno + *len)
- goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
+ if (!xfs_verify_agbext(pag, *bno, *len))
goto out_bad_rec;
return 0;
@@ -257,7 +271,8 @@ xfs_alloc_get_rec(
out_bad_rec:
xfs_warn(mp,
"%s Freespace BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
+ cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+ pag->pag_agno);
xfs_warn(mp,
"start block 0x%x block count 0x%x", *bno, *len);
return -EFSCORRUPTED;
@@ -426,8 +441,8 @@ xfs_alloc_fix_len(
*/
STATIC int /* error code */
xfs_alloc_fixup_trees(
- xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
- xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
+ struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
+ struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
xfs_agblock_t fbno, /* starting block of free extent */
xfs_extlen_t flen, /* length of free extent */
xfs_agblock_t rbno, /* starting block of returned extent */
@@ -488,8 +503,8 @@ xfs_alloc_fixup_trees(
struct xfs_btree_block *bnoblock;
struct xfs_btree_block *cntblock;
- bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
- cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
if (XFS_IS_CORRUPT(mp,
bnoblock->bb_numrecs !=
@@ -589,6 +604,7 @@ xfs_agfl_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
int i;
/*
@@ -597,7 +613,7 @@ xfs_agfl_verify(
* AGFL is what the AGF says is active. We can't get to the AGF, so we
* can't verify just those entries are valid.
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return NULL;
if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
@@ -614,8 +630,8 @@ xfs_agfl_verify(
return __this_address;
for (i = 0; i < xfs_agfl_size(mp); i++) {
- if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
- be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return __this_address;
}
@@ -637,7 +653,7 @@ xfs_agfl_read_verify(
* AGFL is what the AGF says is active. We can't get to the AGF, so we
* can't verify just those entries are valid.
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
@@ -658,7 +674,7 @@ xfs_agfl_write_verify(
xfs_failaddr_t fa;
/* no verification of non-crc AGFLs */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
fa = xfs_agfl_verify(bp);
@@ -684,20 +700,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = {
/*
* Read in the allocation group free block array.
*/
-int /* error */
+int
xfs_alloc_read_agfl(
- xfs_mount_t *mp, /* mount point structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_buf_t **bpp) /* buffer for the ag free block array */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **bpp)
{
- xfs_buf_t *bp; /* return value */
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_buf *bp;
+ int error;
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
@@ -709,19 +724,17 @@ xfs_alloc_read_agfl(
STATIC int
xfs_alloc_update_counters(
struct xfs_trans *tp,
- struct xfs_perag *pag,
struct xfs_buf *agbp,
long len)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
- pag->pagf_freeblks += len;
+ agbp->b_pag->pagf_freeblks += len;
be32_add_cpu(&agf->agf_freeblks, len);
- xfs_trans_agblocks_delta(tp, len);
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
be32_to_cpu(agf->agf_length))) {
- xfs_buf_corruption_error(agbp);
+ xfs_buf_mark_corrupt(agbp);
return -EFSCORRUPTED;
}
@@ -777,7 +790,7 @@ xfs_alloc_cur_setup(
*/
if (!acur->cnt)
acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_CNT);
+ args->agbp, args->pag, XFS_BTNUM_CNT);
error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
if (error)
return error;
@@ -787,10 +800,10 @@ xfs_alloc_cur_setup(
*/
if (!acur->bnolt)
acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
+ args->agbp, args->pag, XFS_BTNUM_BNO);
if (!acur->bnogt)
acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
+ args->agbp, args->pag, XFS_BTNUM_BNO);
return i == 1 ? 0 : -ENOSPC;
}
@@ -907,7 +920,7 @@ xfs_alloc_cur_check(
deactivate = true;
out:
if (deactivate)
- cur->bc_private.a.priv.abt.active = false;
+ cur->bc_ag.abt.active = false;
trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
*new);
return 0;
@@ -922,13 +935,13 @@ xfs_alloc_cur_finish(
struct xfs_alloc_arg *args,
struct xfs_alloc_cur *acur)
{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
int error;
ASSERT(acur->cnt && acur->bnolt);
ASSERT(acur->bno >= acur->rec_bno);
ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
- ASSERT(acur->rec_bno + acur->rec_len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
acur->rec_len, acur->bno, acur->len, 0);
@@ -1026,6 +1039,7 @@ xfs_alloc_ag_vextent_small(
xfs_extlen_t *flenp, /* result length */
int *stat) /* status: 0-freelist, 1-normal/none */
{
+ struct xfs_agf *agf = args->agbp->b_addr;
int error = 0;
xfs_agblock_t fbno = NULLAGBLOCK;
xfs_extlen_t flen = 0;
@@ -1054,17 +1068,17 @@ xfs_alloc_ag_vextent_small(
if (args->minlen != 1 || args->alignment != 1 ||
args->resv == XFS_AG_RESV_AGFL ||
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
- args->minleft))
+ be32_to_cpu(agf->agf_flcount) <= args->minleft)
goto out;
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
+ &fbno, 0);
if (error)
goto error;
if (fbno == NULLAGBLOCK)
goto out;
- xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
(args->datatype & XFS_ALLOC_NOBUSY));
if (args->datatype & XFS_ALLOC_USERDATA) {
@@ -1079,9 +1093,7 @@ xfs_alloc_ag_vextent_small(
}
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
- if (XFS_IS_CORRUPT(args->mp,
- fbno >= be32_to_cpu(
- XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+ if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
goto error;
}
@@ -1092,7 +1104,7 @@ xfs_alloc_ag_vextent_small(
* If we're feeding an AGFL block to something that doesn't live in the
* free space, we need to clear out the OWN_AG rmap.
*/
- error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+ error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
&XFS_RMAP_OINFO_AG);
if (error)
goto error;
@@ -1169,20 +1181,19 @@ xfs_alloc_ag_vextent(
/* if not file data, insert new block into the reverse map btree */
if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
- error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
args->agbno, args->len, &args->oinfo);
if (error)
return error;
}
if (!args->wasfromfl) {
- error = xfs_alloc_update_counters(args->tp, args->pag,
- args->agbp,
+ error = xfs_alloc_update_counters(args->tp, args->agbp,
-((long)(args->len)));
if (error)
return error;
- ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+ ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
args->agbno, args->len));
}
@@ -1203,8 +1214,9 @@ STATIC int /* error */
xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
- xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
- xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
+ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
int error;
xfs_agblock_t fbno; /* start block of found extent */
xfs_extlen_t flen; /* length of found extent */
@@ -1220,7 +1232,7 @@ xfs_alloc_ag_vextent_exact(
* Allocate/initialize a cursor for the by-number freespace btree.
*/
bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
+ args->pag, XFS_BTNUM_BNO);
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
@@ -1280,9 +1292,8 @@ xfs_alloc_ag_vextent_exact(
* Allocate/initialize a cursor for the by-size btree.
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
- ASSERT(args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ args->pag, XFS_BTNUM_CNT);
+ ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
if (error) {
@@ -1353,7 +1364,7 @@ xfs_alloc_walk_iter(
if (error)
return error;
if (i == 0)
- cur->bc_private.a.priv.abt.active = false;
+ cur->bc_ag.abt.active = false;
if (count > 0)
count--;
@@ -1468,7 +1479,7 @@ xfs_alloc_ag_vextent_locality(
if (error)
return error;
if (i) {
- acur->cnt->bc_private.a.priv.abt.active = true;
+ acur->cnt->bc_ag.abt.active = true;
fbcur = acur->cnt;
fbinc = false;
}
@@ -1505,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock(
#ifdef DEBUG
/* Randomly don't execute the first algorithm. */
- if (prandom_u32() & 1)
+ if (prandom_u32_max(2))
return 0;
#endif
@@ -1515,8 +1526,8 @@ xfs_alloc_ag_vextent_lastblock(
* maxlen, go to the start of this block, and skip all those smaller
* than minlen.
*/
- if (len || args->alignment > 1) {
- acur->cnt->bc_ptrs[0] = 1;
+ if (*len || args->alignment > 1) {
+ acur->cnt->bc_levels[0].ptr = 1;
do {
error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
if (error)
@@ -1661,8 +1672,9 @@ STATIC int /* error */
xfs_alloc_ag_vextent_size(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
- xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
+ struct xfs_agf *agf = args->agbp->b_addr;
+ struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
+ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
int error; /* error result */
xfs_agblock_t fbno; /* start of found freespace */
xfs_extlen_t flen; /* length of found freespace */
@@ -1677,9 +1689,8 @@ restart:
* Allocate and initialize a cursor for the by-size btree.
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
+ args->pag, XFS_BTNUM_CNT);
bno_cur = NULL;
- busy = false;
/*
* Look for an entry >= maxlen+alignment-1 blocks.
@@ -1840,7 +1851,7 @@ restart:
* Allocate and initialize a cursor for the by-block tree.
*/
bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
+ args->pag, XFS_BTNUM_BNO);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1851,8 +1862,7 @@ restart:
args->agbno = rbno;
if (XFS_IS_CORRUPT(args->mp,
args->agbno + args->len >
- be32_to_cpu(
- XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+ be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
goto error0;
}
@@ -1888,7 +1898,6 @@ xfs_free_ag_extent(
enum xfs_ag_resv_type type)
{
struct xfs_mount *mp;
- struct xfs_perag *pag;
struct xfs_btree_cur *bno_cur;
struct xfs_btree_cur *cnt_cur;
xfs_agblock_t gtbno; /* start of right neighbor */
@@ -1901,12 +1910,13 @@ xfs_free_ag_extent(
int haveright; /* have a right neighbor */
int i;
int error;
+ struct xfs_perag *pag = agbp->b_pag;
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
if (!xfs_rmap_should_skip_owner_update(oinfo)) {
- error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
+ error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
if (error)
goto error0;
}
@@ -1914,7 +1924,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
@@ -1984,7 +1994,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -2168,10 +2178,8 @@ xfs_free_ag_extent(
/*
* Update the freespace totals in the ag and superblock.
*/
- pag = xfs_perag_get(mp, agno);
- error = xfs_alloc_update_counters(tp, pag, agbp, len);
- xfs_ag_resv_free_extent(pag, type, tp, len);
- xfs_perag_put(pag);
+ error = xfs_alloc_update_counters(tp, agbp, len);
+ xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
if (error)
goto error0;
@@ -2197,14 +2205,15 @@ xfs_free_ag_extent(
*/
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
xfs_mount_t *mp) /* file system mount structure */
{
- mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
+ mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
(mp->m_sb.sb_agblocks + 1) / 2);
+ ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
}
/*
@@ -2262,16 +2271,16 @@ xfs_alloc_min_freelist(
const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
- ASSERT(mp->m_ag_maxlevels > 0);
+ ASSERT(mp->m_alloc_maxlevels > 0);
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_ag_maxlevels);
+ mp->m_alloc_maxlevels);
/* space needed reverse mapping used space btree */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
mp->m_rmap_maxlevels);
@@ -2380,7 +2389,7 @@ xfs_agfl_needs_reset(
int active;
/* no agfl header on v4 supers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return false;
/*
@@ -2424,7 +2433,7 @@ xfs_agfl_reset(
struct xfs_perag *pag)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
ASSERT(pag->pagf_agflreset);
trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
@@ -2446,7 +2455,7 @@ xfs_agfl_reset(
/*
* Defer an AGFL block free. This is effectively equivalent to
- * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ * xfs_free_extent_later() with some special handling particular to AGFL blocks.
*
* Deferring AGFL frees helps prevent log reservation overruns due to too many
* allocation operations in a transaction. AGFL frees are prone to this problem
@@ -2465,13 +2474,14 @@ xfs_defer_agfl_block(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent_free_item *new; /* new element */
- ASSERT(xfs_bmap_free_item_zone != NULL);
+ ASSERT(xfs_extfree_item_cache != NULL);
ASSERT(oinfo != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
- new->xefi_oinfo = *oinfo;
+ new->xefi_owner = oinfo->oi_owner;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
@@ -2479,6 +2489,101 @@ xfs_defer_agfl_block(
}
/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+__xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_extent_free_item *new; /* new element */
+#ifdef DEBUG
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_extfree_item_cache != NULL);
+
+ new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ new->xefi_startblock = bno;
+ new->xefi_blockcount = (xfs_extlen_t)len;
+ if (skip_discard)
+ new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (oinfo) {
+ ASSERT(oinfo->oi_offset == 0);
+
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ new->xefi_owner = oinfo->oi_owner;
+ } else {
+ new->xefi_owner = XFS_RMAP_OWN_NULL;
+ }
+ trace_xfs_bmap_free_defer(tp->t_mountp,
+ XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+}
+
+#ifdef DEBUG
+/*
+ * Check if an AGF has a free extent record whose length is equal to
+ * args->minlen.
+ */
+STATIC int
+xfs_exact_minlen_extent_available(
+ struct xfs_alloc_arg *args,
+ struct xfs_buf *agbp,
+ int *stat)
+{
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
+ args->pag, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 0) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 1 && flen != args->minlen)
+ *stat = 0;
+
+out:
+ xfs_btree_del_cursor(cnt_cur, error);
+
+ return error;
+}
+#endif
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -2501,7 +2606,7 @@ xfs_alloc_fix_freelist(
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
if (!pag->pagf_init) {
- error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2531,7 +2636,7 @@ xfs_alloc_fix_freelist(
* Can fail if we're not blocking on locks, and it's held.
*/
if (!agbp) {
- error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2549,6 +2654,15 @@ xfs_alloc_fix_freelist(
if (!xfs_alloc_space_available(args, need, flags))
goto out_agbp_relse;
+#ifdef DEBUG
+ if (args->alloc_minlen_only) {
+ int stat;
+
+ error = xfs_exact_minlen_extent_available(args, agbp, &stat);
+ if (error || !stat)
+ goto out_agbp_relse;
+ }
+#endif
/*
* Make the freelist shorter if it's too long.
*
@@ -2580,7 +2694,7 @@ xfs_alloc_fix_freelist(
else
targs.oinfo = XFS_RMAP_OINFO_AG;
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
- error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+ error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
@@ -2595,7 +2709,7 @@ xfs_alloc_fix_freelist(
targs.alignment = targs.minlen = targs.prod = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
goto out_agbp_relse;
@@ -2624,7 +2738,7 @@ xfs_alloc_fix_freelist(
* Put each allocated block on the list.
*/
for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
- error = xfs_alloc_put_freelist(tp, agbp,
+ error = xfs_alloc_put_freelist(pag, tp, agbp,
agflbp, bno, 0);
if (error)
goto out_agflbp_relse;
@@ -2648,26 +2762,25 @@ out_no_agbp:
* Get a block from the freelist.
* Returns with the buffer for the block gotten.
*/
-int /* error */
+int
xfs_alloc_get_freelist(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop, /* block address retrieved from freelist */
- int btreeblk) /* destination is a AGF btree */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agblock_t *bnop,
+ int btreeblk)
{
- xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
- xfs_agblock_t bno; /* block number returned */
- __be32 *agfl_bno;
- int error;
- int logflags;
- xfs_mount_t *mp = tp->t_mountp;
- xfs_perag_t *pag; /* per allocation group data */
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_buf *agflbp;
+ xfs_agblock_t bno;
+ __be32 *agfl_bno;
+ int error;
+ uint32_t logflags;
+ struct xfs_mount *mp = tp->t_mountp;
/*
* Freelist is empty, give up.
*/
- agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -2675,8 +2788,7 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
- &agflbp);
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
return error;
@@ -2684,17 +2796,15 @@ xfs_alloc_get_freelist(
/*
* Get the block number and update the data structures.
*/
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
- pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
- xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
@@ -2703,7 +2813,6 @@ xfs_alloc_get_freelist(
pag->pagf_btreeblks++;
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
@@ -2716,9 +2825,9 @@ xfs_alloc_get_freelist(
*/
void
xfs_alloc_log_agf(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* buffer for a.g. freelist header */
- int fields) /* mask of fields to be logged (XFS_AGF_...) */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte offset */
int last; /* last byte offset */
@@ -2745,7 +2854,7 @@ xfs_alloc_log_agf(
sizeof(xfs_agf_t)
};
- trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+ trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
@@ -2754,58 +2863,37 @@ xfs_alloc_log_agf(
}
/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int /* error */
-xfs_alloc_pagf_init(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags) /* XFS_ALLOC_FLAGS_... */
-{
- xfs_buf_t *bp;
- int error;
-
- error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
- if (!error)
- xfs_trans_brelse(tp, bp);
- return error;
-}
-
-/*
* Put the block on the freelist for the allocation group.
*/
-int /* error */
+int
xfs_alloc_put_freelist(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* buffer for a.g. freelist header */
- xfs_buf_t *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno, /* block being freed */
- int btreeblk) /* block came from a AGF btree */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_buf *agflbp,
+ xfs_agblock_t bno,
+ int btreeblk)
{
- xfs_agf_t *agf; /* a.g. freespace structure */
- __be32 *blockp;/* pointer to array entry */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
+ __be32 *blockp;
int error;
- int logflags;
- xfs_mount_t *mp; /* mount structure */
- xfs_perag_t *pag; /* per allocation group data */
+ uint32_t logflags;
__be32 *agfl_bno;
int startoff;
- agf = XFS_BUF_TO_AGF(agbp);
- mp = tp->t_mountp;
+ if (!agflbp) {
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+ }
- if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
- return error;
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
- pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
- xfs_trans_agflist_delta(tp, 1);
pag->pagf_flcount++;
logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
@@ -2814,13 +2902,12 @@ xfs_alloc_put_freelist(
pag->pagf_btreeblks--;
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
startoff = (char *)blockp - (char *)agflbp->b_addr;
@@ -2838,13 +2925,12 @@ xfs_agf_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_agf *agf = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (!xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
return __this_address;
}
@@ -2858,15 +2944,29 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
return __this_address;
+ if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+ be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
+ mp->m_alloc_maxlevels ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
+ mp->m_alloc_maxlevels)
return __this_address;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ if (xfs_has_rmapbt(mp) &&
(be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
+ mp->m_rmap_maxlevels))
+ return __this_address;
+
+ if (xfs_has_rmapbt(mp) &&
+ be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
return __this_address;
/*
@@ -2878,13 +2978,18 @@ xfs_agf_verify(
if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
return __this_address;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ if (xfs_has_lazysbcount(mp) &&
be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
return __this_address;
- if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ if (xfs_has_reflink(mp) &&
+ be32_to_cpu(agf->agf_refcount_blocks) >
+ be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_has_reflink(mp) &&
(be32_to_cpu(agf->agf_refcount_level) < 1 ||
- be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
+ be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
return __this_address;
return NULL;
@@ -2898,7 +3003,7 @@ xfs_agf_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -2914,6 +3019,7 @@ xfs_agf_write_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agf *agf = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_agf_verify(bp);
@@ -2922,11 +3028,11 @@ xfs_agf_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
- XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
@@ -2942,60 +3048,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = {
/*
* Read in the allocation group header (free/alloc section).
*/
-int /* error */
+int
xfs_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_BUF_ */
- struct xfs_buf **bpp) /* buffer for the ag freelist header */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
{
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
- trace_xfs_read_agf(mp, agno);
+ trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
if (error)
return error;
- ASSERT(!(*bpp)->b_error);
- xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+ xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
return 0;
}
/*
- * Read in the allocation group header (free/alloc section).
+ * Read in the allocation group header (free/alloc section) and initialise the
+ * perag structure if necessary. If the caller provides @agfbpp, then return the
+ * locked buffer to the caller, otherwise free it.
*/
-int /* error */
+int
xfs_alloc_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- struct xfs_buf **bpp) /* buffer for the ag freelist header */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
{
- struct xfs_agf *agf; /* ag freelist header */
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_buf *agfbp;
+ struct xfs_agf *agf;
int error;
+ int allocbt_blks;
- trace_xfs_alloc_read_agf(mp, agno);
+ trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
/* We don't support trylock when freeing. */
ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
- ASSERT(agno != NULLAGNUMBER);
- error = xfs_read_agf(mp, tp, agno,
+ error = xfs_read_agf(pag, tp,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
- bpp);
+ &agfbp);
if (error)
return error;
- ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
- pag = xfs_perag_get(mp, agno);
+ agf = agfbp->b_addr;
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -3009,10 +3112,24 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
pag->pagf_init = 1;
- pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+ pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
+
+ /*
+ * Update the in-core allocbt counter. Filter out the rmapbt
+ * subset of the btreeblks counter because the rmapbt is managed
+ * by perag reservation. Subtract one for the rmapbt root block
+ * because the rmap counter includes it while the btreeblks
+ * counter only tracks non-root blocks.
+ */
+ allocbt_blks = pag->pagf_btreeblks;
+ if (xfs_has_rmapbt(pag->pag_mount))
+ allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
+ if (allocbt_blks > 0)
+ atomic64_add(allocbt_blks,
+ &pag->pag_mount->m_allocbt_blks);
}
#ifdef DEBUG
- else if (!XFS_FORCED_SHUTDOWN(mp)) {
+ else if (!xfs_is_shutdown(pag->pag_mount)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3023,7 +3140,10 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
- xfs_perag_put(pag);
+ if (agfbpp)
+ *agfbpp = agfbp;
+ else
+ xfs_trans_brelse(tp, agfbp);
return 0;
}
@@ -3100,7 +3220,7 @@ xfs_alloc_vextent(
* the first a.g. fails.
*/
if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
- (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+ xfs_is_inode32(mp)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
mp->m_sb.sb_agcount), 0);
@@ -3108,7 +3228,7 @@ xfs_alloc_vextent(
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
- /* FALLTHROUGH */
+ fallthrough;
case XFS_ALLOCTYPE_FIRST_AG:
/*
* Rotate through the allocation groups looking for a winner.
@@ -3226,7 +3346,7 @@ error0:
int
xfs_free_extent_fix_freelist(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_buf **agbp)
{
struct xfs_alloc_arg args;
@@ -3235,7 +3355,8 @@ xfs_free_extent_fix_freelist(
memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
- args.agno = agno;
+ args.agno = pag->pag_agno;
+ args.pag = pag;
/*
* validate that the block number is legal - the enables us to detect
@@ -3244,17 +3365,12 @@ xfs_free_extent_fix_freelist(
if (args.agno >= args.mp->m_sb.sb_agcount)
return -EFSCORRUPTED;
- args.pag = xfs_perag_get(args.mp, args.agno);
- ASSERT(args.pag);
-
error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
if (error)
- goto out;
+ return error;
*agbp = args.agbp;
-out:
- xfs_perag_put(args.pag);
- return error;
+ return 0;
}
/*
@@ -3275,8 +3391,10 @@ __xfs_free_extent(
struct xfs_buf *agbp;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
+ struct xfs_perag *pag;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3285,34 +3403,37 @@ __xfs_free_extent(
XFS_ERRTAG_FREE_EXTENT))
return -EIO;
- error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error)
- return error;
+ goto err;
+ agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
error = -EFSCORRUPTED;
- goto err;
+ goto err_release;
}
/* validate the extent size is legal now we have the agf locked */
- if (XFS_IS_CORRUPT(mp,
- agbno + len >
- be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) {
+ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
error = -EFSCORRUPTED;
- goto err;
+ goto err_release;
}
error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
- goto err;
+ goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
- xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
+ xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
+ xfs_perag_put(pag);
return 0;
-err:
+err_release:
xfs_trans_brelse(tp, agbp);
+err:
+ xfs_perag_put(pag);
return error;
}
@@ -3325,7 +3446,7 @@ struct xfs_alloc_query_range_info {
STATIC int
xfs_alloc_query_range_helper(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
struct xfs_alloc_query_range_info *query = priv;
@@ -3340,8 +3461,8 @@ xfs_alloc_query_range_helper(
int
xfs_alloc_query_range(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *low_rec,
- struct xfs_alloc_rec_incore *high_rec,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
xfs_alloc_query_range_fn fn,
void *priv)
{
@@ -3408,7 +3529,7 @@ xfs_agfl_walk(
unsigned int i;
int error;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
i = be32_to_cpu(agf->agf_flfirst);
/* Nothing to walk in an empty AGFL. */
@@ -3428,3 +3549,20 @@ xfs_agfl_walk(
return 0;
}
+
+int __init
+xfs_extfree_intent_init_cache(void)
+{
+ xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
+
+ return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_extfree_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_extfree_item_cache);
+ xfs_extfree_item_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7380fbe4a3ff..2c3f762dfb58 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -75,6 +75,9 @@ typedef struct xfs_alloc_arg {
char wasfromfl; /* set if allocation is from freelist */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
enum xfs_ag_resv_type resv; /* block reservation to use */
+#ifdef DEBUG
+ bool alloc_minlen_only; /* allocate exact minlen extent */
+#endif
} xfs_alloc_arg_t;
/*
@@ -85,7 +88,6 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
/* freespace limit calculations */
-#define XFS_ALLOC_AGFL_RESERVE 4
unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);
@@ -93,65 +95,27 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
xfs_extlen_t need, xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
+int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk);
+int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, struct xfs_buf *agflbp,
+ xfs_agblock_t bno, int btreeblk);
/*
- * Compute and fill in value of m_ag_maxlevels.
+ * Compute and fill in value of m_alloc_maxlevels.
*/
void
xfs_alloc_compute_maxlevels(
struct xfs_mount *mp); /* file system mount structure */
/*
- * Get a block from the freelist.
- * Returns with the buffer for the block gotten.
- */
-int /* error */
-xfs_alloc_get_freelist(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop, /* block address retrieved from freelist */
- int btreeblk); /* destination is a AGF btree */
-
-/*
* Log the given fields from the agf structure.
*/
void
xfs_alloc_log_agf(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields);/* mask of fields to be logged (XFS_AGF_...) */
-
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int /* error */
-xfs_alloc_pagf_init(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags); /* XFS_ALLOC_FLAGS_... */
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int /* error */
-xfs_alloc_put_freelist(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for a.g. freelist header */
- struct xfs_buf *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno, /* block being freed */
- int btreeblk); /* owner was a AGF btree */
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int /* error */
-xfs_alloc_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- struct xfs_buf **bpp); /* buffer for the ag freelist header */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
* Allocate an extent (variable-size).
@@ -204,26 +168,28 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
-int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **bpp);
int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
-int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_buf **agbp);
xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
typedef int (*xfs_alloc_query_range_fn)(
- struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
- void *priv);
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *rec,
+ void *priv);
int xfs_alloc_query_range(struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *low_rec,
- struct xfs_alloc_rec_incore *high_rec,
+ const struct xfs_alloc_rec_incore *low_rec,
+ const struct xfs_alloc_rec_incore *high_rec,
xfs_alloc_query_range_fn fn, void *priv);
int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
void *priv);
@@ -236,4 +202,49 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+static inline __be32 *
+xfs_buf_to_agfl_bno(
+ struct xfs_buf *bp)
+{
+ if (xfs_has_crc(bp->b_mount))
+ return bp->b_addr + sizeof(struct xfs_agfl);
+ return bp->b_addr;
+}
+
+void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
+ bool skip_discard);
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+struct xfs_extent_free_item {
+ struct list_head xefi_list;
+ uint64_t xefi_owner;
+ xfs_fsblock_t xefi_startblock;/* starting fs block number */
+ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ unsigned int xefi_flags;
+};
+
+#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
+#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
+#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+
+static inline void
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo)
+{
+ __xfs_free_extent_later(tp, bno, len, oinfo, false);
+}
+
+
+extern struct kmem_cache *xfs_extfree_item_cache;
+
+int __init xfs_extfree_intent_init_cache(void);
+void xfs_extfree_intent_destroy_cache(void);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 279694d73e4e..549a3cba0234 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -9,61 +9,59 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
+#include "xfs_ag.h"
+static struct kmem_cache *xfs_allocbt_cur_cache;
STATIC struct xfs_btree_cur *
xfs_allocbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno,
- cur->bc_btnum);
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
}
STATIC void
xfs_allocbt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
int btnum = cur->bc_btnum;
- struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
ASSERT(ptr->s != 0);
agf->agf_roots[btnum] = ptr->s;
be32_add_cpu(&agf->agf_levels[btnum], inc);
- pag->pagf_levels[btnum] += inc;
- xfs_perag_put(pag);
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
STATIC int
xfs_allocbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
int error;
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
- &bno, 1);
+ error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp, &bno, 1);
if (error)
return error;
@@ -72,9 +70,9 @@ xfs_allocbt_alloc_block(
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+ atomic64_inc(&cur->bc_mp->m_allocbt_blks);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
- xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
*stat = 1;
@@ -86,19 +84,19 @@ xfs_allocbt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
xfs_agblock_t bno;
int error;
- bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
- error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
+ error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
+ bno, 1);
if (error)
return error;
- xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ atomic64_dec(&cur->bc_mp->m_allocbt_blks);
+ xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
- xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
@@ -107,14 +105,13 @@ xfs_allocbt_free_block(
*/
STATIC void
xfs_allocbt_update_lastrec(
- struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- union xfs_btree_rec *rec,
- int ptr,
- int reason)
+ struct xfs_btree_cur *cur,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
struct xfs_perag *pag;
__be32 len;
int numrecs;
@@ -159,10 +156,9 @@ xfs_allocbt_update_lastrec(
}
agf->agf_longest = len;
- pag = xfs_perag_get(cur->bc_mp, seqno);
+ pag = cur->bc_ag.agbp->b_pag;
pag->pagf_longest = be32_to_cpu(len);
- xfs_perag_put(pag);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
}
STATIC int
@@ -183,8 +179,8 @@ xfs_allocbt_get_maxrecs(
STATIC void
xfs_allocbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->alloc.ar_startblock = rec->alloc.ar_startblock;
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
@@ -192,10 +188,10 @@ xfs_allocbt_init_key_from_rec(
STATIC void
xfs_bnobt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- __u32 x;
+ __u32 x;
x = be32_to_cpu(rec->alloc.ar_startblock);
x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
@@ -205,8 +201,8 @@ xfs_bnobt_init_high_key_from_rec(
STATIC void
xfs_cntbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
key->alloc.ar_startblock = 0;
@@ -226,32 +222,32 @@ xfs_allocbt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
STATIC int64_t
xfs_bnobt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
- xfs_alloc_key_t *kp = &key->alloc;
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
STATIC int64_t
xfs_cntbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
- xfs_alloc_key_t *kp = &key->alloc;
- int64_t diff;
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ const struct xfs_alloc_rec *kp = &key->alloc;
+ int64_t diff;
diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
if (diff)
@@ -262,9 +258,9 @@ xfs_cntbt_key_diff(
STATIC int64_t
xfs_bnobt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
be32_to_cpu(k2->alloc.ar_startblock);
@@ -272,11 +268,11 @@ xfs_bnobt_diff_two_keys(
STATIC int64_t
xfs_cntbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
- int64_t diff;
+ int64_t diff;
diff = be32_to_cpu(k1->alloc.ar_blockcount) -
be32_to_cpu(k2->alloc.ar_blockcount);
@@ -301,7 +297,7 @@ xfs_allocbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
@@ -322,7 +318,7 @@ xfs_allocbt_verify(
if (pag && pag->pagf_init) {
if (level >= pag->pagf_levels[btnum])
return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ } else if (level >= mp->m_alloc_maxlevels)
return __this_address;
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
@@ -382,9 +378,9 @@ const struct xfs_buf_ops xfs_cntbt_buf_ops = {
STATIC int
xfs_bnobt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->alloc.ar_startblock) <
be32_to_cpu(k2->alloc.ar_startblock);
@@ -392,9 +388,9 @@ xfs_bnobt_keys_inorder(
STATIC int
xfs_bnobt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->alloc.ar_startblock) +
be32_to_cpu(r1->alloc.ar_blockcount) <=
@@ -403,9 +399,9 @@ xfs_bnobt_recs_inorder(
STATIC int
xfs_cntbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->alloc.ar_blockcount) <
be32_to_cpu(k2->alloc.ar_blockcount) ||
@@ -416,9 +412,9 @@ xfs_cntbt_keys_inorder(
STATIC int
xfs_cntbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->alloc.ar_blockcount) <
be32_to_cpu(r2->alloc.ar_blockcount) ||
@@ -471,6 +467,41 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.recs_inorder = xfs_cntbt_recs_inorder,
};
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
+ xfs_allocbt_cur_cache);
+ cur->bc_ag.abt.active = false;
+
+ if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_ops = &xfs_cntbt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+ } else {
+ cur->bc_ops = &xfs_bnobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ }
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ if (xfs_has_crc(mp))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ return cur;
+}
+
/*
* Allocate a new allocation btree cursor.
*/
@@ -479,43 +510,77 @@ xfs_allocbt_init_cursor(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer for agf structure */
- xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_perag *pag,
xfs_btnum_t btnum) /* btree identifier */
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
- ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
- if (btnum == XFS_BTNUM_CNT) {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
- cur->bc_ops = &xfs_cntbt_ops;
+ cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
+ if (btnum == XFS_BTNUM_CNT)
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
- } else {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- cur->bc_ops = &xfs_bnobt_ops;
+ else
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
- }
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
- cur->bc_private.a.priv.abt.active = false;
+ cur->bc_ag.agbp = agbp;
- if (xfs_sb_version_hascrc(&mp->m_sb))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+ return cur;
+}
+
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
/*
+ * Install a new free space btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ } else {
+ cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ }
+}
+
+/* Calculate number of records in an alloc btree block. */
+static inline unsigned int
+xfs_allocbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
+
+/*
* Calculate number of records in an alloc btree block.
*/
int
@@ -525,10 +590,26 @@ xfs_allocbt_maxrecs(
int leaf)
{
blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+ return xfs_allocbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_alloc_rec_t);
- return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+/* Free space btrees are at their largest when every other block is free. */
+#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2)
+
+/* Compute the max possible height for free space btrees. */
+unsigned int
+xfs_allocbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS);
}
/* Calculate the freespace btree size for some records. */
@@ -539,3 +620,22 @@ xfs_allocbt_calc_size(
{
return xfs_btree_calc_size(mp->m_alloc_mnr, len);
}
+
+int __init
+xfs_allocbt_init_cur_cache(void)
+{
+ xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur",
+ xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_allocbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_allocbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_allocbt_cur_cache);
+ xfs_allocbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index c9305ebb69f6..45df893ef6bb 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -13,12 +13,14 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xfs_perag;
+struct xbtree_afakeroot;
/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_ALLOC_BLOCK_LEN(mp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ (xfs_has_crc(((mp))) ? \
XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
@@ -45,11 +47,22 @@ struct xfs_mount;
(maxrecs) * sizeof(xfs_alloc_key_t) + \
((index) - 1) * sizeof(xfs_alloc_ptr_t)))
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *,
- xfs_agnumber_t, xfs_btnum_t);
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_allocbt_maxlevels_ondisk(void);
+
+int __init xfs_allocbt_init_cur_cache(void);
+void xfs_allocbt_destroy_cur_cache(void);
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e6149720ce02..e28d93d232de 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -24,6 +24,10 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+
+struct kmem_cache *xfs_attr_intent_cache;
/*
* xfs_attr.c
@@ -44,55 +48,169 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
* Internal routines when attribute list is one block.
*/
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
/*
* Internal routines when attribute list is more than one block.
*/
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
-STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
-STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
+static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_lookup(struct xfs_da_args *args,
+ struct xfs_da_state *state);
+int
+xfs_inode_hasattr(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_has_attr_fork(ip))
+ return 0;
+ if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0)
+ return 0;
+ return 1;
+}
-STATIC int
-xfs_attr_args_init(
- struct xfs_da_args *args,
- struct xfs_inode *dp,
- const unsigned char *name,
- size_t namelen,
- int flags)
+/*
+ * Returns true if the there is exactly only block in the attr fork, in which
+ * case the attribute fork consists of a single leaf block entry.
+ */
+bool
+xfs_attr_is_leaf(
+ struct xfs_inode *ip)
{
+ struct xfs_ifork *ifp = &ip->i_af;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap;
- if (!name)
- return -EINVAL;
+ if (ifp->if_nextents != 1 || ifp->if_format != XFS_DINODE_FMT_EXTENTS)
+ return false;
- memset(args, 0, sizeof(*args));
- args->geo = dp->i_mount->m_attr_geo;
- args->whichfork = XFS_ATTR_FORK;
- args->dp = dp;
- args->flags = flags;
- args->name = name;
- args->namelen = namelen;
- if (args->namelen >= MAXNAMELEN)
- return -EFAULT; /* match IRIX behaviour */
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_get_extent(ifp, &icur, &imap);
+ return imap.br_startoff == 0 && imap.br_blockcount == 1;
+}
+
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
- args->hashval = xfs_da_hashname(args->name, args->namelen);
return 0;
}
-int
-xfs_inode_hasattr(
- struct xfs_inode *ip)
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
{
- if (!XFS_IFORK_Q(ip) ||
- (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_anextents == 0))
- return 0;
- return 1;
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
/*========================================================================
* Overall external interface routines.
@@ -104,91 +222,66 @@ xfs_inode_hasattr(
*/
int
xfs_attr_get_ilocked(
- struct xfs_inode *ip,
struct xfs_da_args *args)
{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- if (!xfs_inode_hasattr(ip))
+ if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
- else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+
+ if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
- else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+ if (xfs_attr_is_leaf(args->dp))
return xfs_attr_leaf_get(args);
- else
- return xfs_attr_node_get(args);
+ return xfs_attr_node_get(args);
}
/*
* Retrieve an extended attribute by name, and its value if requested.
*
- * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
- * just an indication whether the attribute exists and the size of the value if
- * it exists. The size is returned in @valuelenp,
+ * If args->valuelen is zero, then the caller does not want the value, just an
+ * indication whether the attribute exists and the size of the value if it
+ * exists. The size is returned in args.valuelen.
*
- * If the attribute is found, but exceeds the size limit set by the caller in
- * @valuelenp, return -ERANGE with the size of the attribute that was found in
- * @valuelenp.
+ * If args->value is NULL but args->valuelen is non-zero, allocate the buffer
+ * for the value after existence of the attribute has been determined. The
+ * caller always has to free args->value if it is set, no matter if this
+ * function was successful or not.
*
- * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
- * existence of the attribute has been determined. On success, return that
- * buffer to the caller and leave them to free it. On failure, free any
- * allocated buffer and ensure the buffer pointer returned to the caller is
- * null.
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * args->valuelen, return -ERANGE with the size of the attribute that was found
+ * in args->valuelen.
*/
int
xfs_attr_get(
- struct xfs_inode *ip,
- const unsigned char *name,
- size_t namelen,
- unsigned char **value,
- int *valuelenp,
- int flags)
+ struct xfs_da_args *args)
{
- struct xfs_da_args args;
uint lock_mode;
int error;
- ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+ XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
- XFS_STATS_INC(ip->i_mount, xs_attr_get);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(args->dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, ip, name, namelen, flags);
- if (error)
- return error;
+ args->geo = args->dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
/* Entirely possible to look up a name which doesn't exist */
- args.op_flags = XFS_DA_OP_OKNOENT;
- if (flags & ATTR_ALLOC)
- args.op_flags |= XFS_DA_OP_ALLOCVAL;
- else
- args.value = *value;
- args.valuelen = *valuelenp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
- lock_mode = xfs_ilock_attr_map_shared(ip);
- error = xfs_attr_get_ilocked(ip, &args);
- xfs_iunlock(ip, lock_mode);
- *valuelenp = args.valuelen;
+ lock_mode = xfs_ilock_attr_map_shared(args->dp);
+ error = xfs_attr_get_ilocked(args);
+ xfs_iunlock(args->dp, lock_mode);
- /* on error, we have to clean up allocated value buffers */
- if (error) {
- if (flags & ATTR_ALLOC) {
- kmem_free(args.value);
- *value = NULL;
- }
- return error;
- }
- *value = args.value;
- return 0;
+ return error;
}
/*
* Calculate how many blocks we need for the new attribute,
*/
-STATIC int
+int
xfs_attr_calc_size(
struct xfs_da_args *args,
int *local)
@@ -221,14 +314,46 @@ xfs_attr_calc_size(
return nblks;
}
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC.
+ * to use.
+ */
STATIC int
xfs_attr_try_sf_addname(
struct xfs_inode *dp,
struct xfs_da_args *args)
{
- struct xfs_mount *mp = dp->i_mount;
- int error, error2;
+ int error;
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
error = xfs_attr_shortform_addname(args);
if (error == -ENOSPC)
@@ -238,326 +363,793 @@ xfs_attr_try_sf_addname(
* Commit the shortform mods, and we're done.
* NOTE: this is also the error path (EEXIST, etc).
*/
- if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+ if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(dp->i_mount))
xfs_trans_set_sync(args->trans);
- error2 = xfs_trans_commit(args->trans);
- args->trans = NULL;
- return error ? error : error2;
+ return error;
+}
+
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
}
/*
- * Set the attribute specified in @args.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in a attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
*/
-int
-xfs_attr_set_args(
- struct xfs_da_args *args)
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_intent *attr,
+ enum xfs_delattr_state replace_state)
{
- struct xfs_inode *dp = args->dp;
- struct xfs_buf *leaf_bp = NULL;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
+
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
int error;
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
/*
- * If the attribute list is non-existent or a shortform list,
- * upgrade it to a single-leaf-block attribute list.
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion.
*/
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ error = xfs_attr_leaf_try_add(args);
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
/*
- * Build initial attribute list (if required).
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
*/
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
- xfs_attr_shortform_create(args);
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+/*
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
+ */
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
+
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
/*
- * Try to add the attr to the attribute list in the inode.
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
*/
- error = xfs_attr_try_sf_addname(dp, args);
- if (error != -ENOSPC)
+ goto out;
+ }
+ if (error)
+ return error;
+
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
+
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
+ /*
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
+ */
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
+
+ /*
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
+ */
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
+
+ /*
+ * Mark the attribute as INCOMPLETE
+ */
+ return xfs_attr3_leaf_setflag(args);
+}
+
+/* Ensure the da state of an xattr deferred work item is ready to go. */
+static inline void
+xfs_attr_item_init_da_state(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+
+ if (!attr->xattri_da_state)
+ attr->xattri_da_state = xfs_da_state_alloc(args);
+ else
+ xfs_da_state_reset(attr->xattri_da_state, args);
+}
+
+/*
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
+ */
+static
+int xfs_attr_node_removename_setup(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state;
+ int error;
+
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
+
+ state = attr->xattri_da_state;
+ ASSERT(state->path.blk[state->path.active - 1].bp != NULL);
+ ASSERT(state->path.blk[state->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
+
+ error = xfs_attr_leaf_mark_incomplete(args, state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return error;
+}
+
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
+
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
+
+ return error;
+}
+
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and recall the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
+
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
+ }
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
+ fallthrough;
+
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
/*
- * It won't fit in the shortform, transform to a leaf block.
- * GROT: another possible req'mt for a double-split btree op.
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
*/
- error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+ error = xfs_attr3_leaf_flipflags(args);
if (error)
return error;
+ /*
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
+ */
+ attr->xattri_dela_state++;
+ break;
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
/*
- * Prevent the leaf buffer from being unlocked so that a
- * concurrent AIL push cannot grab the half-baked leaf
- * buffer and run into problems with the write verifier.
- * Once we're done rolling the transaction we can release
- * the hold and add the attr to the leaf.
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
*/
- xfs_trans_bhold(args->trans, leaf_bp);
- error = xfs_defer_finish(&args->trans);
- xfs_trans_bhold_release(args->trans, leaf_bp);
- if (error) {
- xfs_trans_brelse(args->trans, leaf_bp);
- return error;
+ xfs_attr_restore_rmt_blk(args);
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_invalidate(args);
+ if (error)
+ return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
+
+ attr->xattri_dela_state++;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
}
+ if (error)
+ return error;
+
+ /*
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
+ */
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ default:
+ ASSERT(0);
+ break;
}
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
- error = xfs_attr_leaf_addname(args);
- else
- error = xfs_attr_node_addname(args);
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
return error;
}
+
/*
- * Remove the attribute specified in @args.
+ * Return EEXIST if attr is found, or ENOATTR if not
*/
-int
-xfs_attr_remove_args(
- struct xfs_da_args *args)
+static int
+xfs_attr_lookup(
+ struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_da_state *state;
int error;
- if (!xfs_inode_hasattr(dp)) {
- error = -ENOATTR;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
- error = xfs_attr_shortform_remove(args);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_removename(args);
- } else {
- error = xfs_attr_node_removename(args);
+ if (!xfs_inode_hasattr(dp))
+ return -ENOATTR;
+
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_attr_sf_findname(args, NULL, NULL);
+
+ if (xfs_attr_is_leaf(dp)) {
+ error = xfs_attr_leaf_hasname(args, &bp);
+
+ if (bp)
+ xfs_trans_brelse(args->trans, bp);
+
+ return error;
}
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ xfs_da_state_free(state);
return error;
}
-int
-xfs_attr_set(
- struct xfs_inode *dp,
- const unsigned char *name,
- size_t namelen,
- unsigned char *value,
- int valuelen,
- int flags)
+static int
+xfs_attr_intent_init(
+ struct xfs_da_args *args,
+ unsigned int op_flags, /* op flag (set or remove) */
+ struct xfs_attr_intent **attr) /* new xfs_attr_intent */
{
- struct xfs_mount *mp = dp->i_mount;
- struct xfs_da_args args;
- struct xfs_trans_res tres;
- int rsvd = (flags & ATTR_ROOT) != 0;
- int error, local;
- XFS_STATS_INC(mp, xs_attr_set);
+ struct xfs_attr_intent *new;
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return -EIO;
+ new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
- error = xfs_attr_args_init(&args, dp, name, namelen, flags);
- if (error)
- return error;
+ *attr = new;
+ return 0;
+}
- args.value = value;
- args.valuelen = valuelen;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
- args.total = xfs_attr_calc_size(&args, &local);
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
- error = xfs_qm_dqattach(dp);
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
if (error)
return error;
- /*
- * If the inode doesn't have an attribute fork, add one.
- * (inode must not be locked when we call this routine)
- */
- if (XFS_IFORK_Q(dp) == 0) {
- int sf_size = sizeof(xfs_attr_sf_hdr_t) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
- error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
- if (error)
- return error;
- }
+ return 0;
+}
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
- /*
- * Root fork attributes can use reserved data blocks for this
- * operation if necessary
- */
- error = xfs_trans_alloc(mp, &tres, args.total, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
if (error)
return error;
- xfs_ilock(dp, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
- rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
- xfs_trans_ijoin(args.trans, dp, 0);
- error = xfs_attr_set_args(&args);
- if (error)
- goto out_trans_cancel;
- if (!args.trans) {
- /* shortform attribute has already been committed */
- goto out_unlock;
- }
+ return 0;
+}
- /*
- * If this is a synchronous mount, make sure that the
- * transaction goes to disk before returning to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+
+ struct xfs_attr_intent *new;
+ int error;
- if ((flags & ATTR_KERNOTIME) == 0)
- xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans);
-out_unlock:
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
-out_trans_cancel:
- if (args.trans)
- xfs_trans_cancel(args.trans);
- goto out_unlock;
+ return 0;
}
/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
+ * Note: If args->value is NULL the attribute will be removed, just like the
+ * Linux ->setattr API.
*/
int
-xfs_attr_remove(
- struct xfs_inode *dp,
- const unsigned char *name,
- size_t namelen,
- int flags)
+xfs_attr_set(
+ struct xfs_da_args *args)
{
+ struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_da_args args;
- int error;
-
- XFS_STATS_INC(mp, xs_attr_remove);
+ struct xfs_trans_res tres;
+ bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
+ int error, local;
+ int rmt_blks = 0;
+ unsigned int total;
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, dp, name, namelen, flags);
+ error = xfs_qm_dqattach(dp);
if (error)
return error;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+
/*
- * we have no control over the attribute names that userspace passes us
+ * We have no control over the attribute names that userspace passes us
* to remove, so we have to allow the name lookup prior to attribute
- * removal to fail.
+ * removal to fail as well. Preserve the logged flag, since we need
+ * to pass that through to the logging code.
*/
- args.op_flags = XFS_DA_OP_OKNOENT;
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_LOGGED);
- error = xfs_qm_dqattach(dp);
- if (error)
- return error;
+ if (args->value) {
+ XFS_STATS_INC(mp, xs_attr_set);
+ args->total = xfs_attr_calc_size(args, &local);
+
+ /*
+ * If the inode doesn't have an attribute fork, add one.
+ * (inode must not be locked when we call this routine)
+ */
+ if (xfs_inode_has_attr_fork(dp) == 0) {
+ int sf_size = sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(args->namelen,
+ args->valuelen);
+
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
+ }
+
+ if (!local)
+ rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ } else {
+ XFS_STATS_INC(mp, xs_attr_remove);
+ rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+ }
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
- XFS_ATTRRM_SPACE_RES(mp), 0,
- (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
- &args.trans);
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
return error;
- xfs_ilock(dp, XFS_ILOCK_EXCL);
- /*
- * No need to make quota reservations here. We expect to release some
- * blocks not allocate in the common case.
- */
- xfs_trans_ijoin(args.trans, dp, 0);
+ if (args->value || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error)
+ goto out_trans_cancel;
+ }
- error = xfs_attr_remove_args(&args);
+ error = xfs_attr_lookup(args);
+ switch (error) {
+ case -EEXIST:
+ /* if no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
+ goto out_trans_cancel;
+
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
+ goto out_trans_cancel;
+
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
+ goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
+ }
if (error)
- goto out;
+ goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(args->trans);
- if ((flags & ATTR_KERNOTIME) == 0)
- xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+ if (!(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
*/
- xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args->trans);
+out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
return error;
-out:
- if (args.trans)
- xfs_trans_cancel(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+out_trans_cancel:
+ if (args->trans)
+ xfs_trans_cancel(args->trans);
+ goto out_unlock;
}
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
+static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+{
+ struct xfs_attr_shortform *sf;
+
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+ return be16_to_cpu(sf->hdr.totsize);
+}
+
/*
* Add a name to the shortform attribute list structure
* This is the external routine.
*/
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
{
- int newsize, forkoff, retval;
+ int newsize, forkoff;
+ int error;
trace_xfs_attr_sf_addname(args);
- retval = xfs_attr_shortform_lookup(args);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
- return retval;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE)
- return retval;
- retval = xfs_attr_shortform_remove(args);
- if (retval)
- return retval;
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
/*
- * Since we have removed the old attr, clear ATTR_REPLACE so
- * that the leaf format add routine won't trip over the attr
- * not being around.
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+ * so that the new attr doesn't fit in shortform format, the
+ * leaf format add routine won't trip over the attr not being
+ * around.
*/
- args->flags &= ~ATTR_REPLACE;
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return -ENOSPC;
- newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
- newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ newsize = xfs_attr_sf_totsize(args->dp);
+ newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
if (!forkoff)
@@ -572,183 +1164,106 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* External routines when attribute list is one block
*========================================================================*/
+/* Save the current remote block info and clear the current pointers. */
+static void
+xfs_attr_save_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno2 = args->blkno;
+ args->index2 = args->index;
+ args->rmtblkno2 = args->rmtblkno;
+ args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
+}
+
+/* Set stored info about a remote block */
+static void
+xfs_attr_restore_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno = args->blkno2;
+ args->index = args->index2;
+ args->rmtblkno = args->rmtblkno2;
+ args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
+}
+
/*
- * Add a name to the leaf attribute list structure
+ * Tries to add an attribute to an inode in leaf form
*
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller. On success the attribute is added
+ * and the inode and transaction are left dirty. If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
*/
STATIC int
-xfs_attr_leaf_addname(
+xfs_attr_leaf_try_add(
struct xfs_da_args *args)
{
- struct xfs_inode *dp;
struct xfs_buf *bp;
- int retval, error, forkoff;
-
- trace_xfs_attr_leaf_addname(args);
+ int error;
- /*
- * Read the (only) block in the attribute list in.
- */
- dp = args->dp;
- args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
if (error)
return error;
/*
- * Look up the given attribute in the leaf block. Figure out if
- * the given flags produce an error or call for an atomic rename.
+ * Look up the xattr name to set the insertion point for the new xattr.
*/
- retval = xfs_attr3_leaf_lookup_int(bp, args);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
- xfs_trans_brelse(args->trans, bp);
- return retval;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE) { /* pure create op */
- xfs_trans_brelse(args->trans, bp);
- return retval;
- }
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ goto out_brelse;
trace_xfs_attr_leaf_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
- args->blkno2 = args->blkno; /* set 2nd entry info*/
- args->index2 = args->index;
- args->rmtblkno2 = args->rmtblkno;
- args->rmtblkcnt2 = args->rmtblkcnt;
- args->rmtvaluelen2 = args->rmtvaluelen;
-
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
}
- /*
- * Add the attribute to the leaf block, transitioning to a Btree
- * if required.
- */
- retval = xfs_attr3_leaf_add(bp, args);
- if (retval == -ENOSPC) {
- /*
- * Promote the attribute list to the Btree format, then
- * Commit that transaction so that the node_addname() call
- * can manage its own transactions.
- */
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
+ return xfs_attr3_leaf_add(bp, args);
- /*
- * Commit the current trans (including the inode) and start
- * a new one.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- return error;
+out_brelse:
+ xfs_trans_brelse(args->trans, bp);
+ return error;
+}
- /*
- * Fob the whole rest of the problem off on the Btree code.
- */
- error = xfs_attr_node_addname(args);
- return error;
- }
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+ struct xfs_da_args *args,
+ struct xfs_buf **bp)
+{
+ int error = 0;
- /*
- * Commit the transaction that added the attr name so that
- * later routines can manage their own transactions.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
if (error)
return error;
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_set(args);
- if (error)
- return error;
- }
+ error = xfs_attr3_leaf_lookup_int(*bp, args);
+ if (error != -ENOATTR && error != -EEXIST)
+ xfs_trans_brelse(args->trans, *bp);
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- */
- if (args->op_flags & XFS_DA_OP_RENAME) {
- /*
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
-
- /*
- * Dismantle the "old" attribute/value pair by removing
- * a "remote" value (if it exists).
- */
- args->index = args->index2;
- args->blkno = args->blkno2;
- args->rmtblkno = args->rmtblkno2;
- args->rmtblkcnt = args->rmtblkcnt2;
- args->rmtvaluelen = args->rmtvaluelen2;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
- }
-
- /*
- * Read in the block containing the "old" attr, then
- * remove the "old" attr from that block (neat, huh!)
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
- if (error)
- return error;
-
- xfs_attr3_leaf_remove(bp, args);
-
- /*
- * If the result is small enough, shrink it all into the inode.
- */
- if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
- }
-
- /*
- * Commit the remove and start the next trans in series.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
-
- } else if (args->rmtblkno > 0) {
- /*
- * Added a "remote" value, just clear the incomplete flag.
- */
- error = xfs_attr3_leaf_clearflag(args);
- }
return error;
}
@@ -772,31 +1287,26 @@ xfs_attr_leaf_removename(
* Remove the attribute.
*/
dp = args->dp;
- args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
- if (error)
- return error;
- error = xfs_attr3_leaf_lookup_int(bp, args);
+ error = xfs_attr_leaf_hasname(args, &bp);
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
+ return error;
+ } else if (error != -EEXIST)
return error;
- }
xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
- if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ return xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
- }
+
return 0;
}
@@ -816,118 +1326,117 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
trace_xfs_attr_leaf_get(args);
- args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
- if (error)
- return error;
+ error = xfs_attr_leaf_hasname(args, &bp);
- error = xfs_attr3_leaf_lookup_int(bp, args);
- if (error != -EEXIST) {
+ if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
return error;
- }
+ } else if (error != -EEXIST)
+ return error;
+
+
error = xfs_attr3_leaf_getvalue(bp, args);
xfs_trans_brelse(args->trans, bp);
return error;
}
-/*========================================================================
- * External routines when attribute list size > geo->blksize
- *========================================================================*/
-
-/*
- * Add a name to a Btree-format attribute list.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- */
+/* Return EEXIST if attr is found, or ENOATTR if not. */
STATIC int
-xfs_attr_node_addname(
- struct xfs_da_args *args)
+xfs_attr_node_lookup(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
{
- struct xfs_da_state *state;
- struct xfs_da_state_blk *blk;
- struct xfs_inode *dp;
- struct xfs_mount *mp;
int retval, error;
- trace_xfs_attr_node_addname(args);
-
/*
- * Fill in bucket of arguments/results/context to carry around.
+ * Search to see if name exists, and get back a pointer to it.
*/
- dp = args->dp;
- mp = dp->i_mount;
-restart:
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = mp;
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ return error;
+
+ return retval;
+}
+
+/*========================================================================
+ * External routines when attribute list size > geo->blksize
+ *========================================================================*/
+
+STATIC int
+xfs_attr_node_addname_find_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error)
- goto out;
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
- goto out;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE)
- goto out;
-
- trace_xfs_attr_node_replace(args);
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ goto error;
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
- args->blkno2 = args->blkno; /* set 2nd entry info*/
- args->index2 = args->index;
- args->rmtblkno2 = args->rmtblkno;
- args->rmtblkcnt2 = args->rmtblkcnt;
- args->rmtvaluelen2 = args->rmtvaluelen;
+ trace_xfs_attr_node_replace(args);
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
+ }
+
+ return 0;
+error:
+ if (attr->xattri_da_state) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
}
+ return error;
+}
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_state *state = attr->xattri_da_state;
+ struct xfs_da_state_blk *blk;
+ int error;
+
+ trace_xfs_attr_node_addname(state->args);
+
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr3_leaf_add(blk->bp, state->args);
- if (retval == -ENOSPC) {
+ error = xfs_attr3_leaf_add(blk->bp, state->args);
+ if (error == -ENOSPC) {
if (state->path.active == 1) {
/*
* Its really a single leaf node, but it had
* out-of-line values so it looked like it *might*
- * have been a b-tree.
+ * have been a b-tree. Let the caller deal with this.
*/
- xfs_da_state_free(state);
- state = NULL;
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
-
- /*
- * Commit the node conversion and start the next
- * trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
-
- goto restart;
+ goto out;
}
/*
@@ -939,9 +1448,6 @@ restart:
error = xfs_da3_split(state);
if (error)
goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -949,204 +1455,51 @@ restart:
xfs_da3_fixhashpath(state, &state->path);
}
- /*
- * Kill the state structure, we're done with it and need to
- * allow the buffers to come back later.
- */
+out:
xfs_da_state_free(state);
- state = NULL;
-
- /*
- * Commit the leaf addition or btree split and start the next
- * trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
+ attr->xattri_da_state = NULL;
+ return error;
+}
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_set(args);
- if (error)
- return error;
- }
+static int
+xfs_attr_node_removename(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *blk;
+ int retval;
/*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
+ * Remove the name and update the hashvals in the tree.
*/
- if (args->op_flags & XFS_DA_OP_RENAME) {
- /*
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- goto out;
-
- /*
- * Dismantle the "old" attribute/value pair by removing
- * a "remote" value (if it exists).
- */
- args->index = args->index2;
- args->blkno = args->blkno2;
- args->rmtblkno = args->rmtblkno2;
- args->rmtblkcnt = args->rmtblkcnt2;
- args->rmtvaluelen = args->rmtvaluelen2;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
- }
-
- /*
- * Re-find the "old" attribute entry after any split ops.
- * The INCOMPLETE flag means that we will find the "old"
- * attr, not the "new" one.
- */
- args->op_flags |= XFS_DA_OP_INCOMPLETE;
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = mp;
- state->inleaf = 0;
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error)
- goto out;
-
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
-
- /*
- * Check to see if the tree needs to be collapsed.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
- }
-
- /*
- * Commit and start the next trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
-
- } else if (args->rmtblkno > 0) {
- /*
- * Added a "remote" value, just clear the incomplete flag.
- */
- error = xfs_attr3_leaf_clearflag(args);
- if (error)
- goto out;
- }
- retval = error = 0;
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
-out:
- if (state)
- xfs_da_state_free(state);
- if (error)
- return error;
return retval;
}
-/*
- * Remove a name from a B-tree attribute list.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_attr_node_removename(
- struct xfs_da_args *args)
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_state *state;
- struct xfs_da_state_blk *blk;
- struct xfs_inode *dp;
- struct xfs_buf *bp;
- int retval, error, forkoff;
-
- trace_xfs_attr_node_removename(args);
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = xfs_da_state_alloc(args);
+ int retval = 0;
+ int error = 0;
/*
- * Tie a string around our finger to remind us where we are.
- */
- dp = args->dp;
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = dp->i_mount;
-
- /*
- * Search to see if name exists, and get back a pointer to it.
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
*/
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
error = xfs_da3_node_lookup_int(state, &retval);
- if (error || (retval != -EEXIST)) {
- if (error == 0)
- error = retval;
+ if (error)
goto out;
- }
- /*
- * If there is an out-of-line value, de-allocate the blocks.
- * This is done before we remove the attribute so that we don't
- * overflow the maximum size of a transaction and/or hit a deadlock.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->bp != NULL);
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- if (args->rmtblkno > 0) {
- /*
- * Fill in disk block numbers in the state structure
- * so that we can get the buffers back after we commit
- * several transactions in the following calls.
- */
- error = xfs_attr_fillstate(state);
- if (error)
- goto out;
-
- /*
- * Mark the attribute as INCOMPLETE, then bunmapi() the
- * remote value.
- */
- error = xfs_attr3_leaf_setflag(args);
- if (error)
- goto out;
- error = xfs_attr_rmtval_remove(args);
- if (error)
- goto out;
-
- /*
- * Refill the state structure with buffers, the prior calls
- * released our buffers.
- */
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- }
-
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
+ error = xfs_attr_node_removename(args, state);
/*
* Check to see if the tree needs to be collapsed.
@@ -1155,150 +1508,14 @@ xfs_attr_node_removename(
error = xfs_da3_join(state);
if (error)
goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
- /*
- * Commit the Btree join operation and start a new trans.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
- }
-
- /*
- * If the result is small enough, push it all into the inode.
- */
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- /*
- * Have to get rid of the copy of this dabuf in the state.
- */
- ASSERT(state->path.active == 1);
- ASSERT(state->path.blk[0].bp);
- state->path.blk[0].bp = NULL;
-
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- goto out;
-
- if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- if (error)
- goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
- } else
- xfs_trans_brelse(args->trans, bp);
}
- error = 0;
+ retval = error = 0;
out:
xfs_da_state_free(state);
- return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level;
-
- trace_xfs_attr_fillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- /*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
- */
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = XFS_BUF_ADDR(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level, error;
-
- trace_xfs_attr_refillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
-
- /*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
- */
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
-
- return 0;
+ if (error)
+ return error;
+ return retval;
}
/*
@@ -1311,35 +1528,29 @@ xfs_attr_refillstate(xfs_da_state_t *state)
* Returns 0 on successful retrieval, otherwise an error.
*/
STATIC int
-xfs_attr_node_get(xfs_da_args_t *args)
+xfs_attr_node_get(
+ struct xfs_da_args *args)
{
- xfs_da_state_t *state;
- xfs_da_state_blk_t *blk;
- int error, retval;
- int i;
+ struct xfs_da_state *state;
+ struct xfs_da_state_blk *blk;
+ int i;
+ int error;
trace_xfs_attr_node_get(args);
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
-
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error) {
- retval = error;
- goto out_release;
- }
- if (retval != -EEXIST)
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ if (error != -EEXIST)
goto out_release;
/*
* Get the value, local or "remote"
*/
blk = &state->path.blk[state->path.active - 1];
- retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+ error = xfs_attr3_leaf_getvalue(blk->bp, args);
/*
* If not in a transaction, we have to release all the buffers.
@@ -1351,7 +1562,7 @@ out_release:
}
xfs_da_state_free(state);
- return retval;
+ return error;
}
/* Returns true if the attribute entry name is valid. */
@@ -1370,3 +1581,20 @@ xfs_attr_namecheck(
/* There shouldn't be any nulls here */
return !memchr(name, 0, length);
}
+
+int __init
+xfs_attr_intent_init_cache(void)
+{
+ xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent",
+ sizeof(struct xfs_attr_intent),
+ 0, 0, NULL);
+
+ return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attr_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attr_intent_cache);
+ xfs_attr_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 4243b2272642..81be9b3e4004 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -21,39 +21,6 @@ struct xfs_attr_list_context;
* as possible so as to fit into the literal area of the inode.
*/
-/*========================================================================
- * External interfaces
- *========================================================================*/
-
-
-#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */
-#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
-#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
-#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
-#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
-#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
-
-#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
-#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
-
-#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
-#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */
-
-#define ATTR_KERNEL_FLAGS \
- (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC)
-
-#define XFS_ATTR_FLAGS \
- { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
- { ATTR_ROOT, "ROOT" }, \
- { ATTR_TRUST, "TRUST" }, \
- { ATTR_SECURE, "SECURE" }, \
- { ATTR_CREATE, "CREATE" }, \
- { ATTR_REPLACE, "REPLACE" }, \
- { ATTR_KERNOTIME, "KERNOTIME" }, \
- { ATTR_KERNOVAL, "KERNOVAL" }, \
- { ATTR_INCOMPLETE, "INCOMPLETE" }, \
- { ATTR_ALLOC, "ALLOC" }
-
/*
* The maximum size (into the kernel or returned from the kernel) of an
* attribute value or the buffer used for an attr_list() call. Larger
@@ -62,45 +29,16 @@ struct xfs_attr_list_context;
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
/*
- * Define how lists of attribute names are returned to the user from
- * the attr_list() call. A large, 32bit aligned, buffer is passed in
- * along with its size. We put an array of offsets at the top that each
- * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
- */
-typedef struct attrlist {
- __s32 al_count; /* number of entries in attrlist */
- __s32 al_more; /* T/F: more attrs (do call again) */
- __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
-} attrlist_t;
-
-/*
- * Show the interesting info about one attribute. This is what the
- * al_offset[i] entry points to.
- */
-typedef struct attrlist_ent { /* data from attr_list() */
- __u32 a_valuelen; /* number bytes in value of attr */
- char a_name[1]; /* attr name (NULL terminated) */
-} attrlist_ent_t;
-
-/*
- * Given a pointer to the (char*) buffer containing the attr_list() result,
- * and an index, return a pointer to the indicated attribute in the buffer.
- */
-#define ATTR_ENTRY(buffer, index) \
- ((attrlist_ent_t *) \
- &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
-
-/*
* Kernel-internal version of the attrlist cursor.
*/
-typedef struct attrlist_cursor_kern {
+struct xfs_attrlist_cursor_kern {
__u32 hashval; /* hash value of next entry to add */
__u32 blkno; /* block containing entry (suggestion) */
__u32 offset; /* offset in list of equal-hashvals */
__u16 pad1; /* padding to match user-level */
__u8 pad2; /* padding to match user-level */
__u8 initted; /* T/F: cursor has been initialized */
-} attrlist_cursor_kern_t;
+};
/*========================================================================
@@ -112,27 +50,484 @@ typedef struct attrlist_cursor_kern {
typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
unsigned char *, int, int);
-typedef struct xfs_attr_list_context {
- struct xfs_trans *tp;
- struct xfs_inode *dp; /* inode */
- struct attrlist_cursor_kern *cursor; /* position in list */
- char *alist; /* output buffer */
+struct xfs_attr_list_context {
+ struct xfs_trans *tp;
+ struct xfs_inode *dp; /* inode */
+ struct xfs_attrlist_cursor_kern cursor; /* position in list */
+ void *buffer; /* output buffer */
/*
* Abort attribute list iteration if non-zero. Can be used to pass
* error values to the xfs_attr_list caller.
*/
- int seen_enough;
+ int seen_enough;
+ bool allow_incomplete;
- ssize_t count; /* num used entries */
- int dupcnt; /* count dup hashvals seen */
- int bufsize; /* total buffer size */
- int firstu; /* first used byte in buffer */
- int flags; /* from VOP call */
- int resynch; /* T/F: resynch with cursor */
- put_listent_func_t put_listent; /* list output fmt function */
- int index; /* index into output buffer */
-} xfs_attr_list_context_t;
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */
+ int resynch; /* T/F: resynch with cursor */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+};
+
+
+/*
+ * ========================================================================
+ * Structure used to pass context around among the delayed routines.
+ * ========================================================================
+ */
+
+/*
+ * Below is a state machine diagram for attr remove operations. The XFS_DAS_*
+ * states indicate places where the function would return -EAGAIN, and then
+ * immediately resume from after being called by the calling function. States
+ * marked as a "subroutine state" indicate that they belong to a subroutine, and
+ * so the calling function needs to pass them back to that subroutine to allow
+ * it to finish where it left off. But they otherwise do not have a role in the
+ * calling function other than just passing through.
+ *
+ * xfs_attr_remove_iter()
+ * │
+ * v
+ * have attr to remove? ──n──> done
+ * │
+ * y
+ * │
+ * v
+ * are we short form? ──y──> xfs_attr_shortform_remove ──> done
+ * │
+ * n
+ * │
+ * V
+ * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done
+ * │
+ * n
+ * │
+ * V
+ * ┌── need to setup state?
+ * │ │
+ * n y
+ * │ │
+ * │ v
+ * │ find attr and get state
+ * │ attr has remote blks? ──n─┐
+ * │ │ v
+ * │ │ find and invalidate
+ * │ y the remote blocks.
+ * │ │ mark attr incomplete
+ * │ ├────────────────┘
+ * └──────────┤
+ * │
+ * v
+ * Have remote blks to remove? ───y─────┐
+ * │ ^ remove the blks
+ * │ │ │
+ * │ │ v
+ * │ XFS_DAS_RMTBLK <─n── done?
+ * │ re-enter with │
+ * │ one less blk to y
+ * │ remove │
+ * │ V
+ * │ refill the state
+ * n │
+ * │ v
+ * │ XFS_DAS_RM_NAME
+ * │ │
+ * ├─────────────────────────┘
+ * │
+ * v
+ * remove leaf and
+ * update hash with
+ * xfs_attr_node_remove_cleanup
+ * │
+ * v
+ * need to
+ * shrink tree? ─n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * join leaf │
+ * │ │
+ * v │
+ * XFS_DAS_RM_SHRINK │
+ * │ │
+ * v │
+ * do the shrink │
+ * │ │
+ * v │
+ * free state <──┘
+ * │
+ * v
+ * done
+ *
+ *
+ * Below is a state machine diagram for attr set operations.
+ *
+ * It seems the challenge with understanding this system comes from trying to
+ * absorb the state machine all at once, when really one should only be looking
+ * at it with in the context of a single function. Once a state sensitive
+ * function is called, the idea is that it "takes ownership" of the
+ * state machine. It isn't concerned with the states that may have belonged to
+ * it's calling parent. Only the states relevant to itself or any other
+ * subroutines there in. Once a calling function hands off the state machine to
+ * a subroutine, it needs to respect the simple rule that it doesn't "own" the
+ * state machine anymore, and it's the responsibility of that calling function
+ * to propagate the -EAGAIN back up the call stack. Upon reentry, it is
+ * committed to re-calling that subroutine until it returns something other than
+ * -EAGAIN. Once that subroutine signals completion (by returning anything other
+ * than -EAGAIN), the calling function can resume using the state machine.
+ *
+ * xfs_attr_set_iter()
+ * │
+ * v
+ * ┌─y─ has an attr fork?
+ * │ |
+ * │ n
+ * │ |
+ * │ V
+ * │ add a fork
+ * │ │
+ * └──────────┤
+ * │
+ * V
+ * ┌─── is shortform?
+ * │ │
+ * │ y
+ * │ │
+ * │ V
+ * │ xfs_attr_set_fmt
+ * │ |
+ * │ V
+ * │ xfs_attr_try_sf_addname
+ * │ │
+ * │ V
+ * │ had enough ──y──> done
+ * │ space?
+ * n │
+ * │ n
+ * │ │
+ * │ V
+ * │ transform to leaf
+ * │ │
+ * │ V
+ * │ hold the leaf buffer
+ * │ │
+ * │ V
+ * │ return -EAGAIN
+ * │ Re-enter in
+ * │ leaf form
+ * │
+ * └─> release leaf buffer
+ * if needed
+ * │
+ * V
+ * ┌───n── fork has
+ * │ only 1 blk?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ xfs_attr_leaf_try_add()
+ * │ │
+ * │ v
+ * │ had enough ──────────────y─────────────┐
+ * │ space? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ return -EAGAIN │
+ * │ re-enter in │
+ * │ node form │
+ * │ │ │
+ * └──────────┤ │
+ * │ │
+ * V │
+ * xfs_attr_node_addname_find_attr │
+ * determines if this │
+ * is create or rename │
+ * find space to store attr │
+ * │ │
+ * v │
+ * xfs_attr_node_addname │
+ * │ │
+ * v │
+ * fits in a node leaf? ────n─────┐ │
+ * │ ^ v │
+ * │ │ single leaf node? │
+ * │ │ │ │ │
+ * y │ y n │
+ * │ │ │ │ │
+ * v │ v v │
+ * update │ grow the leaf split if │
+ * hashvals └── return -EAGAIN needed │
+ * │ retry leaf add │ │
+ * │ on reentry │ │
+ * ├────────────────────────────┘ │
+ * │ │
+ * v │
+ * need to alloc │
+ * ┌─y── or flip flag? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ done │
+ * │ │
+ * │ │
+ * │ XFS_DAS_FOUND_LBLK <────────────────┘
+ * │ │
+ * │ V
+ * │ xfs_attr_leaf_addname()
+ * │ │
+ * │ v
+ * │ ┌──first time through?
+ * │ │ │
+ * │ │ y
+ * │ │ │
+ * │ n v
+ * │ │ if we have rmt blks
+ * │ │ find space for them
+ * │ │ │
+ * │ └──────────┤
+ * │ │
+ * │ v
+ * │ still have
+ * │ ┌─n─ blks to alloc? <──┐
+ * │ │ │ │
+ * │ │ y │
+ * │ │ │ │
+ * │ │ v │
+ * │ │ alloc one blk │
+ * │ │ return -EAGAIN ──┘
+ * │ │ re-enter with one
+ * │ │ less blk to alloc
+ * │ │
+ * │ │
+ * │ └───> set the rmt
+ * │ value
+ * │ │
+ * │ v
+ * │ was this
+ * │ a rename? ──n─┐
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ v │
+ * │ flip incomplete │
+ * │ flag │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_FLIP_LFLAG │
+ * │ │ │
+ * │ v │
+ * │ need to remove │
+ * │ old bks? ──n──┤
+ * │ │ │
+ * │ y │
+ * │ │ │
+ * │ V │
+ * │ remove │
+ * │ ┌───> old blks │
+ * │ │ │ │
+ * │ XFS_DAS_RM_LBLK │ │
+ * │ ^ │ │
+ * │ │ v │
+ * │ └──y── more to │
+ * │ remove? │
+ * │ │ │
+ * │ n │
+ * │ │ │
+ * │ v │
+ * │ XFS_DAS_RD_LEAF │
+ * │ │ │
+ * │ v │
+ * │ remove leaf │
+ * │ │ │
+ * │ v │
+ * │ shrink to sf │
+ * │ if needed │
+ * │ │ │
+ * │ v │
+ * │ done <──────┘
+ * │
+ * └──────> XFS_DAS_FOUND_NBLK
+ * │
+ * v
+ * ┌─────n── need to
+ * │ alloc blks?
+ * │ │
+ * │ y
+ * │ │
+ * │ v
+ * │ find space
+ * │ │
+ * │ v
+ * │ ┌─>XFS_DAS_ALLOC_NODE
+ * │ │ │
+ * │ │ v
+ * │ │ alloc blk
+ * │ │ │
+ * │ │ v
+ * │ └──y── need to alloc
+ * │ more blocks?
+ * │ │
+ * │ n
+ * │ │
+ * │ v
+ * │ set the rmt value
+ * │ │
+ * │ v
+ * │ was this
+ * └────────> a rename? ──n─┐
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * flip incomplete │
+ * flag │
+ * │ │
+ * v │
+ * XFS_DAS_FLIP_NFLAG │
+ * │ │
+ * v │
+ * need to │
+ * remove blks? ─n──┤
+ * │ │
+ * y │
+ * │ │
+ * v │
+ * remove │
+ * ┌────────> old blks │
+ * │ │ │
+ * XFS_DAS_RM_NBLK │ │
+ * ^ │ │
+ * │ v │
+ * └──────y── more to │
+ * remove │
+ * │ │
+ * n │
+ * │ │
+ * v │
+ * XFS_DAS_CLR_FLAG │
+ * │ │
+ * v │
+ * clear flags │
+ * │ │
+ * ├──────────┘
+ * │
+ * v
+ * done
+ */
+
+/*
+ * Enum values for xfs_attr_intent.xattri_da_state
+ *
+ * These values are used by delayed attribute operations to keep track of where
+ * they were before they returned -EAGAIN. A return code of -EAGAIN signals the
+ * calling function to roll the transaction, and then call the subroutine to
+ * finish the operation. The enum is then used by the subroutine to jump back
+ * to where it was and resume executing where it left off.
+ */
+enum xfs_delattr_state {
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that we can transform the initial operation to be performed
+ * according to the xfs_has_larp() state easily.
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
+};
+
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
+struct xfs_attri_log_nameval;
+
+/*
+ * Context used for keeping track of delayed attribute operations
+ */
+struct xfs_attr_intent {
+ /*
+ * used to log this item to an intent containing a list of attrs to
+ * commit later
+ */
+ struct list_head xattri_list;
+
+ /* Used in xfs_attr_node_removename to roll through removing blocks */
+ struct xfs_da_state *xattri_da_state;
+
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Shared buffer containing the attr name and value so that the logging
+ * code can share large memory buffers between log items.
+ */
+ struct xfs_attri_log_nameval *xattri_nameval;
+
+ /* Used to keep track of current state of delayed operation */
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTRI_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
+ struct xfs_bmbt_irec xattri_map;
+};
/*========================================================================
@@ -143,21 +538,84 @@ typedef struct xfs_attr_list_context {
* Overall external interface routines.
*/
int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *);
-int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_attr_list_ilocked(struct xfs_attr_list_context *);
+int xfs_attr_list(struct xfs_attr_list_context *);
int xfs_inode_hasattr(struct xfs_inode *ip);
-int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- size_t namelen, unsigned char **value, int *valuelenp,
- int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
- size_t namelen, unsigned char *value, int valuelen, int flags);
-int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name,
- size_t namelen, int flags);
-int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
- int flags, struct attrlist_cursor_kern *cursor);
+bool xfs_attr_is_leaf(struct xfs_inode *ip);
+int xfs_attr_get_ilocked(struct xfs_da_args *args);
+int xfs_attr_get(struct xfs_da_args *args);
+int xfs_attr_set(struct xfs_da_args *args);
+int xfs_attr_set_iter(struct xfs_attr_intent *attr);
+int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
bool xfs_attr_namecheck(const void *name, size_t length);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of a attr remove to determine the
+ * next state, the attribute fork may be null. This can occur only occur
+ * on a pure remove, but we grab the next state before we check if a
+ * replace operation is being performed. If we are called from any other
+ * context, i_af is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!xfs_inode_has_attr_fork(args->dp))
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (args->op_flags & XFS_DA_OP_LOGGED)
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
+
+extern struct kmem_cache *xfs_attr_intent_cache;
+int __init xfs_attr_intent_init_cache(void);
+void xfs_attr_intent_destroy_cache(void);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index fed537a4353d..beee51ad75ce 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -19,14 +19,16 @@
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_errortag.h"
/*
@@ -287,6 +289,23 @@ xfs_attr3_leaf_verify_entry(
return NULL;
}
+/*
+ * Validate an attribute leaf block.
+ *
+ * Empty leaf blocks can occur under the following circumstances:
+ *
+ * 1. setxattr adds a new extended attribute to a file;
+ * 2. The file has zero existing attributes;
+ * 3. The attribute is too large to fit in the attribute fork;
+ * 4. The attribute is small enough to fit in a leaf block;
+ * 5. A log flush occurs after committing the transaction that creates
+ * the (empty) leaf block; and
+ * 6. The filesystem goes down after the log flush but before the new
+ * attribute can be committed to the leaf block.
+ *
+ * Hence we need to ensure that we don't fail the validation purely
+ * because the leaf is empty.
+ */
static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
@@ -309,14 +328,6 @@ xfs_attr3_leaf_verify(
return fa;
/*
- * In recovery there is a transient state where count == 0 is valid
- * because we may have transitioned an empty shortform attr to a leaf
- * if the attr didn't fit in shortform.
- */
- if (!xfs_log_in_recovery(mp) && ichdr.count == 0)
- return __this_address;
-
- /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -331,6 +342,13 @@ xfs_attr3_leaf_verify(
(char *)bp->b_addr + ichdr.firstused)
return __this_address;
+ /*
+ * NOTE: This verifier historically failed empty leaf buffers because
+ * we expect the fork to be in another format. Empty attr fork format
+ * conversions are possible during xattr set, however, and format
+ * conversion is not atomic with the xattr set that triggers it. We
+ * cannot assume leaf blocks are non-empty until that is addressed.
+ */
buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
for (i = 0, ent = entries; i < ichdr.count; ent++, i++) {
fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr,
@@ -384,7 +402,7 @@ xfs_attr3_leaf_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -406,7 +424,7 @@ xfs_attr3_leaf_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -446,13 +464,36 @@ xfs_attr3_leaf_read(
*========================================================================*/
/*
- * If namespace bits don't match return 0.
- * If all match then return 1.
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear that attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in attr->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
*/
-STATIC int
-xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+static bool
+xfs_attr_match(
+ struct xfs_da_args *args,
+ uint8_t namelen,
+ unsigned char *name,
+ int flags)
{
- return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+
+ if (args->namelen != namelen)
+ return false;
+ if (memcmp(args->name, name, namelen) != 0)
+ return false;
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
+ if (args->attr_filter !=
+ (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
+ return false;
+ return true;
}
static int
@@ -464,7 +505,7 @@ xfs_attr_copy_value(
/*
* No copy if all we have to do is get the length
*/
- if (args->flags & ATTR_KERNOVAL) {
+ if (!args->valuelen) {
args->valuelen = valuelen;
return 0;
}
@@ -477,8 +518,8 @@ xfs_attr_copy_value(
return -ERANGE;
}
- if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
- args->value = kmem_alloc_large(valuelen, 0);
+ if (!args->value) {
+ args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
if (!args->value)
return -ENOMEM;
}
@@ -505,13 +546,13 @@ xfs_attr_copy_value(
*========================================================================*/
/*
- * Query whether the requested number of additional bytes of extended
+ * Query whether the total requested number of attr fork bytes of extended
* attribute space will be able to fit inline.
*
- * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * Returns zero if not, else the i_forkoff fork offset to be used in the
* literal area for attribute data once the new bytes have been added.
*
- * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
* special case for dev/uuid inodes, they have fixed size data forks.
*/
int
@@ -525,10 +566,16 @@ xfs_attr_shortform_bytesfit(
int maxforkoff;
int offset;
+ /*
+ * Check if the new size could fit at all first:
+ */
+ if (bytes > XFS_LITINO(mp))
+ return 0;
+
/* rounded down */
- offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+ offset = (XFS_LITINO(mp) - bytes) >> 3;
- if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) {
minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
return (offset >= minforkoff) ? minforkoff : 0;
}
@@ -543,29 +590,29 @@ xfs_attr_shortform_bytesfit(
* to real extents, or the delalloc conversion will take care of the
* literal area rebalancing.
*/
- if (bytes <= XFS_IFORK_ASIZE(dp))
- return dp->i_d.di_forkoff;
+ if (bytes <= xfs_inode_attr_fork_size(dp))
+ return dp->i_forkoff;
/*
* For attr2 we can try to move the forkoff if there is space in the
* literal area, but for the old format we are done if there is no
* space in the fixed attribute fork.
*/
- if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+ if (!xfs_has_attr2(mp))
return 0;
dsize = dp->i_df.if_bytes;
- switch (dp->i_d.di_format) {
+ switch (dp->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
/*
- * If there is no attr fork and the data fork is extents,
+ * If there is no attr fork and the data fork is extents,
* determine if creating the default attr fork will result
* in the extents form migrating to btree. If so, the
* minimum offset only needs to be the space required for
* the btree root.
*/
- if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+ if (!dp->i_forkoff && dp->i_df.if_bytes >
xfs_default_attroffset(dp))
dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
break;
@@ -576,10 +623,10 @@ xfs_attr_shortform_bytesfit(
* minforkoff to where the btree root can finish so we have
* plenty of room for attrs
*/
- if (dp->i_d.di_forkoff) {
- if (offset < dp->i_d.di_forkoff)
+ if (dp->i_forkoff) {
+ if (offset < dp->i_forkoff)
return 0;
- return dp->i_d.di_forkoff;
+ return dp->i_forkoff;
}
dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
break;
@@ -593,8 +640,7 @@ xfs_attr_shortform_bytesfit(
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
@@ -605,99 +651,134 @@ xfs_attr_shortform_bytesfit(
}
/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
+ * - noattr2 mount option is set,
+ * - on-disk version bit says it is already set, or
+ * - the attr2 mount option is not set to enable automatic upgrade from attr1.
*/
STATIC void
-xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+xfs_sbversion_add_attr2(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp)
{
- if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
- !(xfs_sb_version_hasattr2(&mp->m_sb))) {
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
- xfs_sb_version_addattr2(&mp->m_sb);
- spin_unlock(&mp->m_sb_lock);
- xfs_log_sb(tp);
- } else
- spin_unlock(&mp->m_sb_lock);
- }
+ if (xfs_has_noattr2(mp))
+ return;
+ if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ return;
+ if (!xfs_has_attr2(mp))
+ return;
+
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr2(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
}
/*
* Create the initial contents of a shortform attribute list.
*/
void
-xfs_attr_shortform_create(xfs_da_args_t *args)
+xfs_attr_shortform_create(
+ struct xfs_da_args *args)
{
- xfs_attr_sf_hdr_t *hdr;
- xfs_inode_t *dp;
- struct xfs_ifork *ifp;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *hdr;
trace_xfs_attr_sf_create(args);
- dp = args->dp;
- ASSERT(dp != NULL);
- ifp = dp->i_afp;
- ASSERT(ifp != NULL);
ASSERT(ifp->if_bytes == 0);
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
- ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */
- dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
- ifp->if_flags |= XFS_IFINLINE;
- } else {
- ASSERT(ifp->if_flags & XFS_IFINLINE);
- }
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
- hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
- hdr->count = 0;
+ hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+ memset(hdr, 0, sizeof(*hdr));
hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args: args containing attribute name and namelen
+ * sfep: If not null, pointer will be set to the last attr entry found on
+ -EEXIST. On -ENOATTR pointer is left at the last entry in the list
+ * basep: If not null, pointer is set to the byte offset of the entry in the
+ * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of
+ * the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+ struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int base = sizeof(struct xfs_attr_sf_hdr);
+ int size = 0;
+ int end;
+ int i;
+
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
+ sfe = &sf->list[0];
+ end = sf->hdr.count;
+ for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
+ base += size, i++) {
+ size = xfs_attr_sf_entsize(sfe);
+ if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ continue;
+ break;
+ }
+
+ if (sfep != NULL)
+ *sfep = sfe;
+
+ if (basep != NULL)
+ *basep = base;
+
+ if (i == end)
+ return -ENOATTR;
+ return -EEXIST;
+}
+
+/*
* Add a name/value pair to the shortform attribute list.
* Overflow from the inode has already been checked for.
*/
void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+xfs_attr_shortform_add(
+ struct xfs_da_args *args,
+ int forkoff)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- int i, offset, size;
- xfs_mount_t *mp;
- xfs_inode_t *dp;
- struct xfs_ifork *ifp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int offset, size;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ struct xfs_ifork *ifp;
trace_xfs_attr_sf_add(args);
dp = args->dp;
mp = dp->i_mount;
- dp->i_d.di_forkoff = forkoff;
+ dp->i_forkoff = forkoff;
- ifp = dp->i_afp;
- ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-#ifdef DEBUG
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
+ ifp = &dp->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
ASSERT(0);
-#endif
- }
offset = (char *)sfe - (char *)sf;
- size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
- sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ sfe->flags = args->attr_filter;
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
sf->hdr.count++;
@@ -716,13 +797,10 @@ xfs_attr_fork_remove(
struct xfs_inode *ip,
struct xfs_trans *tp)
{
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- ip->i_d.di_forkoff = 0;
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-
- ASSERT(ip->i_d.di_anextents == 0);
- ASSERT(ip->i_afp == NULL);
+ ASSERT(ip->i_af.if_nextents == 0);
+ xfs_ifork_zap_attr(ip);
+ ip->i_forkoff = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -730,35 +808,35 @@ xfs_attr_fork_remove(
* Remove an attribute from the shortform attribute list structure.
*/
int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
+xfs_attr_sf_removename(
+ struct xfs_da_args *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- int base, size=0, end, totsize, i;
- xfs_mount_t *mp;
- xfs_inode_t *dp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int size = 0, end, totsize;
+ unsigned int base;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ int error;
trace_xfs_attr_sf_remove(args);
dp = args->dp;
mp = dp->i_mount;
- base = sizeof(xfs_attr_sf_hdr_t);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
- sfe = &sf->list[0];
- end = sf->hdr.count;
- for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
- base += size, i++) {
- size = XFS_ATTR_SF_ENTSIZE(sfe);
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- break;
- }
- if (i == end)
- return -ENOATTR;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
+
+ error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
+ if (error != -EEXIST)
+ return error;
+ size = xfs_attr_sf_entsize(sfe);
/*
* Fix up the attribute fork data, covering the hole
@@ -774,19 +852,18 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
* Fix up the start offset of the attribute fork
*/
totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
- (mp->m_flags & XFS_MOUNT_ATTR2) &&
- (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
- dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
- ASSERT(dp->i_d.di_forkoff);
+ dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+ ASSERT(dp->i_forkoff);
ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
- !(mp->m_flags & XFS_MOUNT_ATTR2) ||
- dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ !xfs_has_attr2(mp) ||
+ dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -803,26 +880,22 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
int
xfs_attr_shortform_lookup(xfs_da_args_t *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
int i;
struct xfs_ifork *ifp;
trace_xfs_attr_sf_lookup(args);
- ifp = args->dp->i_afp;
- ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ ifp = &args->dp->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- return -EEXIST;
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return -EEXIST;
}
return -ENOATTR;
}
@@ -830,9 +903,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
/*
* Retrieve the attribute value and length.
*
- * If ATTR_KERNOVAL is specified, only the length needs to be returned.
- * Unlike a lookup, we only return an error if the attribute does not
- * exist or we can't retrieve the value.
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
*/
int
xfs_attr_shortform_getvalue(
@@ -842,31 +915,23 @@ xfs_attr_shortform_getvalue(
struct xfs_attr_sf_entry *sfe;
int i;
- ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+ ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
- sfe->valuelen);
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return xfs_attr_copy_value(args,
+ &sfe->nameval[args->namelen], sfe->valuelen);
}
return -ENOATTR;
}
-/*
- * Convert from using the shortform to the leaf. On success, return the
- * buffer so that we can keep it locked until we're totally done with it.
- */
+/* Convert from using the shortform to the leaf format. */
int
xfs_attr_shortform_to_leaf(
- struct xfs_da_args *args,
- struct xfs_buf **leaf_bp)
+ struct xfs_da_args *args)
{
struct xfs_inode *dp;
struct xfs_attr_shortform *sf;
@@ -881,13 +946,13 @@ xfs_attr_shortform_to_leaf(
trace_xfs_attr_sf_to_leaf(args);
dp = args->dp;
- ifp = dp->i_afp;
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ ifp = &dp->i_af;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (xfs_attr_shortform_t *)tmpbuffer;
+ sf = (struct xfs_attr_shortform *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -918,17 +983,16 @@ xfs_attr_shortform_to_leaf(
nargs.valuelen = sfe->valuelen;
nargs.hashval = xfs_da_hashname(sfe->nameval,
sfe->namelen);
- nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+ nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == -ENOATTR);
error = xfs_attr3_leaf_add(bp, &nargs);
ASSERT(error != -ENOSPC);
if (error)
goto out;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
- *leaf_bp = bp;
out:
kmem_free(tmpbuffer);
return error;
@@ -966,12 +1030,11 @@ xfs_attr_shortform_allfit(
return 0;
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return 0;
- bytes += sizeof(struct xfs_attr_sf_entry) - 1
- + name_loc->namelen
- + be16_to_cpu(name_loc->valuelen);
+ bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
}
- if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
- (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ if (xfs_has_attr2(dp->i_mount) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
return xfs_attr_shortform_bytesfit(dp, bytes);
@@ -990,8 +1053,8 @@ xfs_attr_shortform_verify(
int i;
int64_t size;
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+ ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = ifp->if_bytes;
@@ -1010,6 +1073,8 @@ xfs_attr_shortform_verify(
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
* within the data buffer.
+ * xfs_attr_sf_entry is defined with a 1-byte variable
+ * array at the end, so we must subtract that off.
*/
if (((char *)sfep + sizeof(*sfep)) >= endp)
return __this_address;
@@ -1023,7 +1088,7 @@ xfs_attr_shortform_verify(
* within the data buffer. The next entry starts after the
* name component, so nextentry is an acceptable test.
*/
- next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep);
+ next_sfep = xfs_attr_sf_nextentry(sfep);
if ((char *)next_sfep > endp)
return __this_address;
@@ -1093,9 +1158,17 @@ xfs_attr3_leaf_to_shortform(
goto out;
if (forkoff == -1) {
- ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
- ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
- xfs_attr_fork_remove(dp, args->trans);
+ /*
+ * Don't remove the attr fork if this operation is the first
+ * part of a attr replace operations. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
goto out;
}
@@ -1124,7 +1197,7 @@ xfs_attr3_leaf_to_shortform(
nargs.value = &name_loc->nameval[nargs.namelen];
nargs.valuelen = be16_to_cpu(name_loc->valuelen);
nargs.hashval = be32_to_cpu(entry->hashval);
- nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+ nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK;
xfs_attr_shortform_add(&nargs, forkoff);
}
error = 0;
@@ -1155,6 +1228,11 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
@@ -1170,9 +1248,9 @@ xfs_attr3_leaf_to_node(
xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
bp2->b_ops = bp1->b_ops;
memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
- hdr3->blkno = cpu_to_be64(bp2->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
}
xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
@@ -1235,12 +1313,12 @@ xfs_attr3_leaf_create(
memset(&ichdr, 0, sizeof(ichdr));
ichdr.firstused = args->geo->blksize;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
- hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
@@ -1449,10 +1527,12 @@ xfs_attr3_leaf_add_work(
entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
ichdr->freemap[mapindex].size);
entry->hashval = cpu_to_be32(args->hashval);
- entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
- entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
- if (args->op_flags & XFS_DA_OP_RENAME) {
- entry->flags |= XFS_ATTR_INCOMPLETE;
+ entry->flags = args->attr_filter;
+ if (tmp)
+ entry->flags |= XFS_ATTR_LOCAL;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
args->index2++;
@@ -2346,7 +2426,7 @@ xfs_attr3_leaf_lookup_int(
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
if (ichdr.count >= args->geo->blksize / 8) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -2365,11 +2445,11 @@ xfs_attr3_leaf_lookup_int(
break;
}
if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -2399,33 +2479,17 @@ xfs_attr3_leaf_lookup_int(
/*
* GROT: Add code to remove incomplete entries.
*/
- /*
- * If we are looking for INCOMPLETE entries, show only those.
- * If we are looking for complete entries, show only those.
- */
- if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) !=
- !!(entry->flags & XFS_ATTR_INCOMPLETE)) {
- continue;
- }
if (entry->flags & XFS_ATTR_LOCAL) {
name_loc = xfs_attr3_leaf_name_local(leaf, probe);
- if (name_loc->namelen != args->namelen)
- continue;
- if (memcmp(args->name, name_loc->nameval,
- args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ if (!xfs_attr_match(args, name_loc->namelen,
+ name_loc->nameval, entry->flags))
continue;
args->index = probe;
return -EEXIST;
} else {
name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
- if (name_rmt->namelen != args->namelen)
- continue;
- if (memcmp(args->name, name_rmt->name,
- args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ if (!xfs_attr_match(args, name_rmt->namelen,
+ name_rmt->name, entry->flags))
continue;
args->index = probe;
args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
@@ -2444,9 +2508,9 @@ xfs_attr3_leaf_lookup_int(
* Get the value associated with an attribute name from a leaf attribute
* list structure.
*
- * If ATTR_KERNOVAL is specified, only the length needs to be returned.
- * Unlike a lookup, we only return an error if the attribute does not
- * exist or we can't retrieve the value.
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
*/
int
xfs_attr3_leaf_getvalue(
@@ -2771,10 +2835,7 @@ xfs_attr3_leaf_clearflag(
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- /*
- * Commit the flag value change and start the next trans in series.
- */
- return xfs_trans_roll_inode(&args->trans, args->dp);
+ return 0;
}
/*
@@ -2822,10 +2883,7 @@ xfs_attr3_leaf_setflag(
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- /*
- * Commit the flag value change and start the next trans in series.
- */
- return xfs_trans_roll_inode(&args->trans, args->dp);
+ return 0;
}
/*
@@ -2940,10 +2998,5 @@ xfs_attr3_leaf_flipflags(
XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
}
- /*
- * Commit the flag value change and start the next trans in series.
- */
- error = xfs_trans_roll_inode(&args->trans, args->dp);
-
- return error;
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 73615b1dd1a8..368f4d9fa1d5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
@@ -8,7 +8,6 @@
#define __XFS_ATTR_LEAF_H__
struct attrlist;
-struct attrlist_cursor_kern;
struct xfs_attr_list_context;
struct xfs_da_args;
struct xfs_da_state;
@@ -50,9 +49,11 @@ void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
- struct xfs_buf **leaf_bp);
-int xfs_attr_shortform_remove(struct xfs_da_args *args);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_attr_sf_removename(struct xfs_da_args *args);
+int xfs_attr_sf_findname(struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 8b7f74b3bea2..d440393b40eb 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -51,7 +51,7 @@ xfs_attr3_rmt_blocks(
struct xfs_mount *mp,
int attrlen)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
return (attrlen + buflen - 1) / buflen;
}
@@ -96,8 +96,6 @@ xfs_attr3_rmt_verify(
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return __this_address;
if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
@@ -128,11 +126,11 @@ __xfs_attr3_rmt_read_verify(
int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return 0;
ptr = bp->b_addr;
- bno = bp->b_bn;
+ bno = xfs_buf_daddr(bp);
len = BBTOB(bp->b_length);
ASSERT(len >= blksize);
@@ -193,11 +191,11 @@ xfs_attr3_rmt_write_verify(
xfs_daddr_t bno;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
ptr = bp->b_addr;
- bno = bp->b_bn;
+ bno = xfs_buf_daddr(bp);
len = BBTOB(bp->b_length);
ASSERT(len >= blksize);
@@ -248,7 +246,7 @@ xfs_attr3_rmt_hdr_set(
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return 0;
rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
@@ -286,7 +284,7 @@ xfs_attr_rmtval_copyout(
uint8_t **dst)
{
char *src = bp->b_addr;
- xfs_daddr_t bno = bp->b_bn;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
int len = BBTOB(bp->b_length);
int blksize = mp->m_attr_geo->blksize;
@@ -298,7 +296,7 @@ xfs_attr_rmtval_copyout(
byte_cnt = min(*valuelen, byte_cnt);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (xfs_attr3_rmt_hdr_ok(src, ino, *offset,
byte_cnt, bno)) {
xfs_alert(mp,
@@ -334,7 +332,7 @@ xfs_attr_rmtval_copyin(
uint8_t **src)
{
char *dst = bp->b_addr;
- xfs_daddr_t bno = bp->b_bn;
+ xfs_daddr_t bno = xfs_buf_daddr(bp);
int len = BBTOB(bp->b_length);
int blksize = mp->m_attr_geo->blksize;
@@ -397,7 +395,7 @@ xfs_attr_rmtval_get(
trace_xfs_attr_rmtval_get(args);
- ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->valuelen != 0);
ASSERT(args->rmtvaluelen == args->valuelen);
valuelen = args->rmtvaluelen;
@@ -440,32 +438,23 @@ xfs_attr_rmtval_get(
}
/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
+ * Find a "hole" in the attribute address space large enough for us to drop the
+ * new attributes value into
*/
int
-xfs_attr_rmtval_set(
+xfs_attr_rmt_find_hole(
struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_bmbt_irec map;
- xfs_dablk_t lblkno;
- xfs_fileoff_t lfileoff = 0;
- uint8_t *src = args->value;
- int blkcnt;
- int valuelen;
- int nmap;
int error;
- int offset = 0;
-
- trace_xfs_attr_rmtval_set(args);
+ int blkcnt;
+ xfs_fileoff_t lfileoff = 0;
/*
- * Find a "hole" in the attribute address space large enough for
- * us to drop the new attribute's value into. Because CRC enable
- * attributes have headers, we can't just do a straight byte to FSB
- * conversion and have to take the header space into account.
+ * Because CRC enable attributes have headers, we can't just do a
+ * straight byte to FSB conversion and have to take the header space
+ * into account.
*/
blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
@@ -473,48 +462,26 @@ xfs_attr_rmtval_set(
if (error)
return error;
- args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkno = (xfs_dablk_t)lfileoff;
args->rmtblkcnt = blkcnt;
- /*
- * Roll through the "value", allocating blocks on disk as required.
- */
- while (blkcnt > 0) {
- /*
- * Allocate a single extent, up to the size of the value.
- *
- * Note that we have to consider this a data allocation as we
- * write the remote attribute without logging the contents.
- * Hence we must ensure that we aren't using blocks that are on
- * the busy list so that we don't overwrite blocks which have
- * recently been freed but their transactions are not yet
- * committed to disk. If we overwrite the contents of a busy
- * extent and then crash then the block may not contain the
- * correct metadata after log recovery occurs.
- */
- nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
- blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
- &nmap);
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
-
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
- lblkno += map.br_blockcount;
- blkcnt -= map.br_blockcount;
+ return 0;
+}
- /*
- * Start the next trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- return error;
- }
+int
+xfs_attr_rmtval_set_value(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ uint8_t *src = args->value;
+ int blkcnt;
+ int valuelen;
+ int nmap;
+ int error;
+ int offset = 0;
/*
* Roll through the "value", copying the attribute value to the
@@ -576,6 +543,7 @@ xfs_attr_rmtval_stale(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buf *bp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -583,14 +551,82 @@ xfs_attr_rmtval_stale(
XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
return -EFSCORRUPTED;
- bp = xfs_buf_incore(mp->m_ddev_targp,
+ error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
- XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
+ XFS_FSB_TO_BB(mp, map->br_blockcount),
+ incore_flags, &bp);
+ if (error) {
+ if (error == -ENOENT)
+ return 0;
+ return error;
}
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+/*
+ * Find a hole for the attr and store it in the delayed attr context. This
+ * initializes the context to roll through allocating an attr extent for a
+ * delayed attr operation
+ */
+int
+xfs_attr_rmtval_find_space(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
+ int error;
+
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
+ args->rmtblkcnt = 0;
+ args->rmtblkno = 0;
+ memset(map, 0, sizeof(struct xfs_bmbt_irec));
+
+ error = xfs_attr_rmt_find_hole(args);
+ if (error)
+ return error;
+
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
+
+ return 0;
+}
+
+/*
+ * Write one block of the value associated with an attribute into the
+ * out-of-line buffer that we have defined for it. This is similar to a subset
+ * of xfs_attr_rmtval_set, but records the current block to the delayed attr
+ * context, and leaves transaction handling to the caller.
+ */
+int
+xfs_attr_rmtval_set_blk(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
+ int nmap;
+ int error;
+
+ nmap = 1;
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ map, &nmap);
+ if (error)
+ return error;
+
+ ASSERT(nmap == 1);
+ ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
+ (map->br_startblock != HOLESTARTBLOCK));
+
+ /* roll attribute extent map forwards */
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
+
return 0;
}
@@ -599,15 +635,12 @@ xfs_attr_rmtval_stale(
* out-of-line buffer that it is stored on.
*/
int
-xfs_attr_rmtval_remove(
+xfs_attr_rmtval_invalidate(
struct xfs_da_args *args)
{
xfs_dablk_t lblkno;
int blkcnt;
int error;
- int done;
-
- trace_xfs_attr_rmtval_remove(args);
/*
* Roll through the "value", invalidating the attribute value's blocks.
@@ -635,28 +668,45 @@ xfs_attr_rmtval_remove(
lblkno += map.br_blockcount;
blkcnt -= map.br_blockcount;
}
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the out-of-line
+ * buffer that it is stored on. Returns -EAGAIN for the caller to refresh the
+ * transaction and re-call the function. Callers should keep calling this
+ * routine until it returns something other than -EAGAIN.
+ */
+int
+xfs_attr_rmtval_remove(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error, done;
/*
- * Keep de-allocating extents until the remote-value region is gone.
+ * Unmap value blocks for this attr.
*/
- lblkno = args->rmtblkno;
- blkcnt = args->rmtblkcnt;
- done = 0;
- while (!done) {
- error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK, 1, &done);
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
+ error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
+ args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
+ if (error)
+ return error;
- /*
- * Close out trans and start the next one in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, args->dp);
- if (error)
- return error;
+ /*
+ * We don't need an explicit state here to pick up where we left off. We
+ * can figure it out using the !done return code. The actual value of
+ * attr->xattri_dela_state may be some value reminiscent of the calling
+ * function, but it's value is irrelevant with in the context of this
+ * function. Once we are done here, the next state is set as needed by
+ * the parent
+ */
+ if (!done) {
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
+ return -EAGAIN;
}
+
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 6fb4572845ce..d097ec6c4dc3 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
@@ -9,9 +9,12 @@
int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
-int xfs_attr_rmtval_set(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
-
+int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr);
+int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index aafa4fe70624..37578b369d9b 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -13,7 +13,6 @@
* to fit into the literal area of the inode.
*/
typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
/*
* We generate this then sort it, attr_list() must return things in hash-order.
@@ -27,16 +26,26 @@ typedef struct xfs_attr_sf_sort {
unsigned char *name; /* name value, pointer into buffer */
} xfs_attr_sf_sort_t;
-#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
- (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
-#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
- ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
-#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
- ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
-#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
- (be16_to_cpu(((xfs_attr_shortform_t *) \
- ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+/* space name/value uses */
+static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen)
+{
+ return sizeof(struct xfs_attr_sf_entry) + nlen + vlen;
+}
+
+/* space an entry uses */
+static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
+{
+ return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
+}
+
+/* next entry in struct */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_attr_sf_entsize(sfep);
+}
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index 99017b8df292..a04f266ae644 100644
--- a/fs/xfs/libxfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9a6d7a84689a..49d0d4ea63fc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -31,13 +31,13 @@
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
-
-kmem_zone_t *xfs_bmap_free_item_zone;
+struct kmem_cache *xfs_bmap_intent_cache;
/*
* Miscellaneous helper functions
@@ -52,46 +52,54 @@ xfs_bmap_compute_maxlevels(
xfs_mount_t *mp, /* file system mount structure */
int whichfork) /* data or attr fork */
{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
int maxrootrecs; /* max records in root block */
int minleafrecs; /* min records in leaf block */
int minnoderecs; /* min records in node block */
int sz; /* root block size */
/*
- * The maximum number of extents in a file, hence the maximum
- * number of leaf entries, is controlled by the type of di_nextents
- * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
- * (a signed 16-bit number, xfs_aextnum_t).
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
*
- * Note that we can no longer assume that if we are in ATTR1 that
- * the fork offset of all the inodes will be
- * (xfs_default_attroffset(ip) >> 3) because we could have mounted
- * with ATTR2 and then mounted back with ATTR1, keeping the
- * di_forkoff's fixed but probably at various positions. Therefore,
- * for both ATTR1 and ATTR2 we have to assume the worst case scenario
- * of a minimum size available.
+ * Note that we can no longer assume that if we are in ATTR1 that the
+ * fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted with
+ * ATTR2 and then mounted back with ATTR1, keeping the i_forkoff's fixed
+ * but probably at various positions. Therefore, for both ATTR1 and
+ * ATTR2 we have to assume the worst case scenario of a minimum size
+ * available.
*/
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
+ else
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
+
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ maxblocks = howmany_64(maxleafents, minleafrecs);
for (level = 1; maxblocks > 1; level++) {
if (maxblocks <= maxrootrecs)
maxblocks = 1;
else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ maxblocks = howmany_64(maxblocks, minnoderecs);
}
mp->m_bm_maxlevels[whichfork] = level;
+ ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
+}
+
+unsigned int
+xfs_bmap_compute_attr_offset(
+ struct xfs_mount *mp)
+{
+ if (mp->m_sb.sb_inodesize == 256)
+ return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
}
STATIC int /* error */
@@ -120,10 +128,11 @@ xfs_bmbt_lookup_first(
*/
static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
return whichfork != XFS_COW_FORK &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >
- XFS_IFORK_MAXEXT(ip, whichfork);
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork);
}
/*
@@ -131,10 +140,11 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
*/
static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
return whichfork != XFS_COW_FORK &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <=
- XFS_IFORK_MAXEXT(ip, whichfork);
+ ifp->if_format == XFS_DINODE_FMT_BTREE &&
+ ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork);
}
/*
@@ -190,24 +200,15 @@ uint
xfs_default_attroffset(
struct xfs_inode *ip)
{
- struct xfs_mount *mp = ip->i_mount;
- uint offset;
-
- if (mp->m_sb.sb_inodesize == 256) {
- offset = XFS_LITINO(mp, ip->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- } else {
- offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- }
-
- ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
- return offset;
+ if (ip->i_df.if_format == XFS_DINODE_FMT_DEV)
+ return roundup(sizeof(xfs_dev_t), 8);
+ return M_IGEO(ip->i_mount)->attr_fork_offset;
}
/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
+ * Helper routine to reset inode i_forkoff field when switching attribute fork
+ * from local to extent format - we reset it where possible to make space
+ * available for inline data fork extents.
*/
STATIC void
xfs_bmap_forkoff_reset(
@@ -215,12 +216,12 @@ xfs_bmap_forkoff_reset(
int whichfork)
{
if (whichfork == XFS_ATTR_FORK &&
- ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+ ip->i_df.if_format != XFS_DINODE_FMT_DEV &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
- if (dfl_forkoff > ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = dfl_forkoff;
+ if (dfl_forkoff > ip->i_forkoff)
+ ip->i_forkoff = dfl_forkoff;
}
}
@@ -236,11 +237,11 @@ xfs_bmap_get_bp(
if (!cur)
return NULL;
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
- if (!cur->bc_bufs[i])
+ for (i = 0; i < cur->bc_maxlevels; i++) {
+ if (!cur->bc_levels[i].bp)
break;
- if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
- return cur->bc_bufs[i];
+ if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno)
+ return cur->bc_levels[i].bp;
}
/* Chase down all the log items to see if the bp is there */
@@ -248,7 +249,7 @@ xfs_bmap_get_bp(
struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
if (bip->bli_item.li_type == XFS_LI_BUF &&
- XFS_BUF_ADDR(bip->bli_buf) == bno)
+ xfs_buf_daddr(bip->bli_buf) == bno)
return bip->bli_buf;
}
@@ -293,7 +294,7 @@ xfs_check_block(
else
thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
if (*thispa == *pp) {
- xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
__func__, j, i,
(unsigned long long)be64_to_cpu(*thispa));
xfs_err(mp, "%s: ptrs are equal in node\n",
@@ -313,35 +314,32 @@ xfs_check_block(
*/
STATIC void
xfs_bmap_check_leaf_extents(
- xfs_btree_cur_t *cur, /* btree cursor or null */
+ struct xfs_btree_cur *cur, /* btree cursor or null */
xfs_inode_t *ip, /* incore inode pointer */
int whichfork) /* data or attr fork */
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
+ struct xfs_buf *bp; /* buffer for "block" */
int error; /* error return value */
xfs_extnum_t i=0, j; /* index into the extents list */
- struct xfs_ifork *ifp; /* fork structure */
int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
__be64 *pp; /* pointer to block address */
xfs_bmbt_rec_t *ep; /* pointer to current extent */
xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
xfs_bmbt_rec_t *nextp; /* pointer to next extent */
int bp_release = 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
return;
- }
/* skip large extent count inodes */
- if (ip->i_d.di_nextents > 10000)
+ if (ip->i_df.if_nextents > 10000)
return;
bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
@@ -468,7 +466,7 @@ error0:
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
__func__, i);
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -485,7 +483,7 @@ STATIC void
xfs_bmap_validate_ret(
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_bmbt_irec_t *mval,
int nmap,
int ret_nmap)
@@ -522,55 +520,6 @@ xfs_bmap_validate_ret(
#endif /* DEBUG */
/*
- * bmap free list manipulation functions
- */
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-void
-__xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
-{
- struct xfs_extent_free_item *new; /* new element */
-#ifdef DEBUG
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
-
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
- ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_bmap_free_item_zone != NULL);
-
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
- if (oinfo)
- new->xefi_oinfo = *oinfo;
- else
- new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
- new->xefi_skip_discard = skip_discard;
- trace_xfs_bmap_free_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
-}
-
-/*
* Inode fork format manipulation functions
*/
@@ -589,12 +538,12 @@ xfs_bmap_btree_to_extents(
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
- xfs_buf_t *cbp; /* child block's buffer */
+ struct xfs_buf *cbp; /* child block's buffer */
int error; /* error return value */
__be64 *pp; /* ptr to block address */
struct xfs_owner_info oinfo;
@@ -605,8 +554,7 @@ xfs_bmap_btree_to_extents(
ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
@@ -625,16 +573,15 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
- xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo);
- ip->i_d.di_nblocks--;
+ xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+ ip->i_nblocks--;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
xfs_trans_binval(tp, cbp);
- if (cur->bc_bufs[0] == cbp)
- cur->bc_bufs[0] = NULL;
+ if (cur->bc_levels[0].bp == cbp)
+ cur->bc_levels[0].bp = NULL;
xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
- ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -669,15 +616,14 @@ xfs_bmap_extents_to_btree(
mp = ip->i_mount;
ASSERT(whichfork != XFS_COW_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
/*
* Make space in the inode incore. This needs to be undone if we fail
* to expand the root.
*/
xfs_iroot_realloc(ip, 1, whichfork);
- ifp->if_flags |= XFS_IFBROOT;
/*
* Fill in the root.
@@ -690,11 +636,11 @@ xfs_bmap_extents_to_btree(
* Need a cursor. Can't allocate until bb_level is filled in.
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
/*
* Convert to a btree with two levels, one record in root.
*/
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ ifp->if_format = XFS_DINODE_FMT_BTREE;
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = mp;
@@ -727,8 +673,8 @@ xfs_bmap_extents_to_btree(
ASSERT(tp->t_firstblock == NULLFSBLOCK ||
args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock));
tp->t_firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- ip->i_d.di_nblocks++;
+ cur->bc_ino.allocated++;
+ ip->i_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, args.fsbno),
@@ -741,7 +687,7 @@ xfs_bmap_extents_to_btree(
*/
abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
- xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+ xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
XFS_BTREE_LONG_PTRS);
@@ -752,7 +698,7 @@ xfs_bmap_extents_to_btree(
xfs_bmbt_disk_set_all(arp, &rec);
cnt++;
}
- ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+ ASSERT(cnt == ifp->if_nextents);
xfs_btree_set_numrecs(ablock, cnt);
/*
@@ -780,7 +726,7 @@ out_unreserve_dquot:
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
out_root_realloc:
xfs_iroot_realloc(ip, -1, whichfork);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
ASSERT(ifp->if_broot == NULL);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -799,19 +745,17 @@ xfs_bmap_local_to_extents_empty(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(whichfork != XFS_COW_FORK);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(ifp->if_bytes == 0);
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+ ASSERT(ifp->if_nextents == 0);
xfs_bmap_forkoff_reset(ip, whichfork);
- ifp->if_flags &= ~XFS_IFINLINE;
- ifp->if_flags |= XFS_IFEXTENTS;
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -832,7 +776,7 @@ xfs_bmap_local_to_extents(
int flags; /* logging flags returned */
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_alloc_arg_t args; /* allocation arguments */
- xfs_buf_t *bp; /* buffer for extent block */
+ struct xfs_buf *bp; /* buffer for extent block */
struct xfs_bmbt_irec rec;
struct xfs_iext_cursor icur;
@@ -841,8 +785,8 @@ xfs_bmap_local_to_extents(
* So sending the data fork of a regular inode is invalid.
*/
ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
@@ -852,7 +796,6 @@ xfs_bmap_local_to_extents(
flags = 0;
error = 0;
- ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE);
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
@@ -909,8 +852,8 @@ xfs_bmap_local_to_extents(
xfs_iext_first(ifp, &icur);
xfs_iext_insert(ip, &icur, &rec, 0);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
- ip->i_d.di_nblocks = 1;
+ ifp->if_nextents = 1;
+ ip->i_nblocks = 1;
xfs_trans_mod_dquot_byino(tp, ip,
XFS_TRANS_DQ_BCOUNT, 1L);
flags |= xfs_ilog_fext(whichfork);
@@ -929,13 +872,15 @@ xfs_bmap_add_attrfork_btree(
xfs_inode_t *ip, /* incore inode pointer */
int *flags) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_block *block = ip->i_df.if_broot;
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
xfs_mount_t *mp; /* file system mount struct */
int stat; /* newroot status */
mp = ip->i_mount;
- if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+
+ if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
*flags |= XFS_ILOG_DBROOT;
else {
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -953,7 +898,7 @@ xfs_bmap_add_attrfork_btree(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return -ENOSPC;
}
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}
return 0;
@@ -971,16 +916,17 @@ xfs_bmap_add_attrfork_extents(
struct xfs_inode *ip, /* incore inode pointer */
int *flags) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* bmap btree cursor */
+ struct xfs_btree_cur *cur; /* bmap btree cursor */
int error; /* error return value */
- if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
+ xfs_inode_data_fork_size(ip))
return 0;
cur = NULL;
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
XFS_DATA_FORK);
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -1005,7 +951,7 @@ xfs_bmap_add_attrfork_local(
{
struct xfs_da_args dargs; /* args for dir/attr code */
- if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+ if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
return 0;
if (S_ISDIR(VFS_I(ip)->i_mode)) {
@@ -1028,24 +974,28 @@ xfs_bmap_add_attrfork_local(
return -EFSCORRUPTED;
}
-/* Set an inode attr fork off based on the format */
-int
+/*
+ * Set an inode attr fork offset based on the format of the data fork.
+ */
+static int
xfs_bmap_set_attrforkoff(
struct xfs_inode *ip,
int size,
int *version)
{
- switch (ip->i_d.di_format) {
+ int default_size = xfs_default_attroffset(ip) >> 3;
+
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ ip->i_forkoff = default_size;
break;
case XFS_DINODE_FMT_LOCAL:
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
- else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+ ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_forkoff)
+ ip->i_forkoff = default_size;
+ else if (xfs_has_attr2(ip->i_mount) && version)
*version = 2;
break;
default:
@@ -1073,48 +1023,28 @@ xfs_bmap_add_attrfork(
int logflags; /* logging flags */
int error; /* error return value */
- ASSERT(XFS_IFORK_Q(ip) == 0);
+ ASSERT(xfs_inode_has_attr_fork(ip) == 0);
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
blks = XFS_ADDAFORK_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
if (error)
return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto trans_cancel;
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
goto trans_cancel;
- if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) {
- error = -EFSCORRUPTED;
- goto trans_cancel;
- }
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
- /*
- * For inodes coming from pre-6.2 filesystems.
- */
- ASSERT(ip->i_d.di_aformat == 0);
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- }
- xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
goto trans_cancel;
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
- ip->i_afp->if_flags = XFS_IFEXTENTS;
+
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
logflags = 0;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_LOCAL:
error = xfs_bmap_add_attrfork_local(tp, ip, &logflags);
break;
@@ -1132,17 +1062,17 @@ xfs_bmap_add_attrfork(
xfs_trans_log_inode(tp, ip, logflags);
if (error)
goto trans_cancel;
- if (!xfs_sb_version_hasattr(&mp->m_sb) ||
- (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+ if (!xfs_has_attr(mp) ||
+ (!xfs_has_attr2(mp) && version == 2)) {
bool log_sb = false;
spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasattr(&mp->m_sb)) {
- xfs_sb_version_addattr(&mp->m_sb);
+ if (!xfs_has_attr(mp)) {
+ xfs_add_attr(mp);
log_sb = true;
}
- if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
- xfs_sb_version_addattr2(&mp->m_sb);
+ if (!xfs_has_attr2(mp) && version == 2) {
+ xfs_add_attr2(mp);
log_sb = true;
}
spin_unlock(&mp->m_sb_lock);
@@ -1178,20 +1108,20 @@ xfs_iread_bmbt_block(
{
struct xfs_iread_state *ir = priv;
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_btree_block *block;
struct xfs_buf *bp;
struct xfs_bmbt_rec *frp;
xfs_extnum_t num_recs;
xfs_extnum_t j;
- int whichfork = cur->bc_private.b.whichfork;
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
block = xfs_btree_get_block(cur, level, &bp);
/* Abort if we find more records than nextents. */
num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(ir->loaded + num_recs >
- XFS_IFORK_NEXTENTS(ip, whichfork))) {
+ if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) {
xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).",
(unsigned long long)ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
@@ -1217,7 +1147,7 @@ xfs_iread_bmbt_block(
xfs_bmap_fork_to_state(whichfork));
trace_xfs_read_extent(ip, &ir->icur,
xfs_bmap_fork_to_state(whichfork), _THIS_IP_);
- xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur);
+ xfs_iext_next(ifp, &ir->icur);
}
return 0;
@@ -1233,19 +1163,15 @@ xfs_iread_extents(
int whichfork)
{
struct xfs_iread_state ir;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_cur *cur;
int error;
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!xfs_need_iread_extents(ifp))
+ return 0;
- if (XFS_IS_CORRUPT(mp,
- XFS_IFORK_FORMAT(ip, whichfork) !=
- XFS_DINODE_FMT_BTREE)) {
- error = -EFSCORRUPTED;
- goto out;
- }
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ir.loaded = 0;
xfs_iext_first(ifp, &ir.icur);
@@ -1256,14 +1182,11 @@ xfs_iread_extents(
if (error)
goto out;
- if (XFS_IS_CORRUPT(mp,
- ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) {
+ if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
error = -EFSCORRUPTED;
goto out;
}
ASSERT(ir.loaded == xfs_iext_count(ifp));
-
- ifp->if_flags |= XFS_IFEXTENTS;
return 0;
out:
xfs_iext_destroy(ifp);
@@ -1284,26 +1207,23 @@ xfs_bmap_first_unused(
xfs_fileoff_t *first_unused, /* unused block */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
xfs_fileoff_t lastaddr = 0;
xfs_fileoff_t lowest, max;
int error;
- ASSERT(xfs_ifork_has_extents(ip, whichfork) ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
*first_unused = 0;
return 0;
}
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ ASSERT(xfs_ifork_has_extents(ifp));
+
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
lowest = max = *first_unused;
for_each_xfs_iext(ifp, &icur, &got) {
@@ -1334,12 +1254,12 @@ xfs_bmap_last_before(
xfs_fileoff_t *last_block, /* last block */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
int error;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
*last_block = 0;
return 0;
@@ -1351,11 +1271,9 @@ xfs_bmap_last_before(
return -EFSCORRUPTED;
}
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got))
*last_block = 0;
@@ -1370,15 +1288,13 @@ xfs_bmap_last_extent(
struct xfs_bmbt_irec *rec,
int *is_empty)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_iext_cursor icur;
int error;
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
xfs_iext_last(ifp, &icur);
if (!xfs_iext_get_extent(ifp, &icur, rec))
@@ -1438,16 +1354,17 @@ xfs_bmap_last_offset(
xfs_fileoff_t *last_block,
int whichfork)
{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec rec;
int is_empty;
int error;
*last_block = 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
return 0;
- if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork)))
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
return -EFSCORRUPTED;
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
@@ -1459,39 +1376,6 @@ xfs_bmap_last_offset(
}
/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not. For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int /* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
-{
- struct xfs_ifork *ifp; /* inode fork pointer */
- int rval; /* return value */
- xfs_bmbt_irec_t s; /* internal version of extent */
- struct xfs_iext_cursor icur;
-
-#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK)
- return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
-#endif /* !DEBUG */
- if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
- return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- xfs_iext_first(ifp, &icur);
- xfs_iext_get_extent(ifp, &icur, &s);
- rval = s.br_startoff == 0 && s.br_blockcount == 1;
- if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
- return rval;
-}
-
-/*
* Extent tree manipulation functions used during allocation.
*/
@@ -1503,32 +1387,26 @@ xfs_bmap_add_extent_delay_real(
struct xfs_bmalloca *bma,
int whichfork)
{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
struct xfs_bmbt_irec *new = &bma->got;
int error; /* error return value */
int i; /* temp state */
- struct xfs_ifork *ifp; /* inode fork pointer */
xfs_fileoff_t new_endoff; /* end offset of new entry */
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
int tmp_rval; /* partial logging flags */
- struct xfs_mount *mp;
- xfs_extnum_t *nextents;
struct xfs_bmbt_irec old;
- mp = bma->ip->i_mount;
- ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(whichfork != XFS_ATTR_FORK);
- nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
- &bma->ip->i_d.di_nextents);
-
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
- (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -1571,7 +1449,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -1589,13 +1467,13 @@ xfs_bmap_add_extent_delay_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -1616,7 +1494,7 @@ xfs_bmap_add_extent_delay_real(
xfs_iext_remove(bma->ip, &bma->icur, state);
xfs_iext_prev(ifp, &bma->icur);
xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
- (*nextents)--;
+ ifp->if_nextents--;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1720,8 +1598,8 @@ xfs_bmap_add_extent_delay_real(
PREV.br_startblock = new->br_startblock;
PREV.br_state = new->br_state;
xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ ifp->if_nextents++;
- (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1786,7 +1664,8 @@ xfs_bmap_add_extent_delay_real(
* The left neighbor is not contiguous.
*/
xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
- (*nextents)++;
+ ifp->if_nextents++;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1818,7 +1697,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
PREV.br_startoff = new_endoff;
PREV.br_blockcount = temp;
@@ -1872,7 +1751,8 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is not contiguous.
*/
xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
- (*nextents)++;
+ ifp->if_nextents++;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1904,7 +1784,7 @@ xfs_bmap_add_extent_delay_real(
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
PREV.br_startblock = nullstartblock(da_new);
PREV.br_blockcount = temp;
@@ -1957,7 +1837,7 @@ xfs_bmap_add_extent_delay_real(
xfs_iext_next(ifp, &bma->icur);
xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
- (*nextents)++;
+ ifp->if_nextents++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -2025,8 +1905,8 @@ xfs_bmap_add_extent_delay_real(
xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
if (bma->cur) {
- da_new += bma->cur->bc_private.b.allocated;
- bma->cur->bc_private.b.allocated = 0;
+ da_new += bma->cur->bc_ino.allocated;
+ bma->cur->bc_ino.allocated = 0;
}
/* adjust for changes in reserved delayed indirect blocks */
@@ -2055,11 +1935,11 @@ xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ struct xfs_btree_cur **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp) /* inode logging flags */
{
- xfs_btree_cur_t *cur; /* btree cursor */
+ struct xfs_btree_cur *cur; /* btree cursor */
int error; /* error return value */
int i; /* temp state */
struct xfs_ifork *ifp; /* inode fork pointer */
@@ -2067,14 +1947,14 @@ xfs_bmap_add_extent_unwritten_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec old;
*logflagsp = 0;
cur = *curp;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2117,7 +1997,7 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -2135,13 +2015,13 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2161,8 +2041,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &LEFT);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
+ ifp->if_nextents -= 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2214,8 +2093,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &LEFT);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ ifp->if_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2257,9 +2135,8 @@ xfs_bmap_add_extent_unwritten_real(
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &PREV);
+ ifp->if_nextents--;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2366,8 +2243,8 @@ xfs_bmap_add_extent_unwritten_real(
xfs_iext_update_extent(ip, state, icur, &PREV);
xfs_iext_insert(ip, icur, new, state);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ ifp->if_nextents++;
+
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2442,9 +2319,8 @@ xfs_bmap_add_extent_unwritten_real(
xfs_iext_update_extent(ip, state, icur, &PREV);
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2495,9 +2371,8 @@ xfs_bmap_add_extent_unwritten_real(
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, &r[1], state);
xfs_iext_insert(ip, icur, &r[0], state);
+ ifp->if_nextents += 2;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2573,7 +2448,7 @@ xfs_bmap_add_extent_unwritten_real(
/* clear out the allocated field, done with it now in any case. */
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
*curp = cur;
}
@@ -2601,10 +2476,10 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
@@ -2632,15 +2507,15 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN)))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2738,9 +2613,9 @@ xfs_bmap_add_extent_hole_real(
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
int *logflagsp,
- int flags)
+ uint32_t flags)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_cur *cur = *curp;
int error; /* error return value */
@@ -2748,11 +2623,11 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2783,17 +2658,17 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_startblock + new->br_blockcount == right.br_startblock &&
new->br_state == right.br_state &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2812,9 +2687,8 @@ xfs_bmap_add_extent_hole_real(
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
+ ifp->if_nextents--;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
@@ -2912,8 +2786,8 @@ xfs_bmap_add_extent_hole_real(
* Insert a new entry.
*/
xfs_iext_insert(ip, icur, new, state);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ ifp->if_nextents++;
+
if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
@@ -2955,7 +2829,7 @@ xfs_bmap_add_extent_hole_real(
/* clear out the allocated field, done with it now in any case. */
if (cur)
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
@@ -2968,7 +2842,7 @@ done:
*/
/*
- * Adjust the size of the new extent based on di_extsize and rt extsize.
+ * Adjust the size of the new extent based on i_extsize and rt extsize.
*/
int
xfs_bmap_extsize_align(
@@ -3029,15 +2903,15 @@ xfs_bmap_extsize_align(
/*
* For large extent hint sizes, the aligned extent might be larger than
- * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
- * the length back under MAXEXTLEN. The outer allocation loops handle
- * short allocation just fine, so it is safe to do this. We only want to
- * do it when we are forced to, though, because it means more allocation
- * operations are required.
+ * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
*/
- while (align_alen > MAXEXTLEN)
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
align_alen -= extsz;
- ASSERT(align_alen <= MAXEXTLEN);
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
/*
* If the previous block overlaps with this proposed allocation
@@ -3127,9 +3001,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
- /* see MAXEXTLEN handling above */
+ /* see XFS_BMBT_MAX_EXTLEN handling above */
ASSERT(orig_end <= align_off + align_alen ||
- align_alen + extsz > MAXEXTLEN);
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
}
#ifdef DEBUG
@@ -3310,7 +3184,8 @@ xfs_bmap_longest_free_extent(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
+ NULL);
if (error) {
/* Couldn't lock the AGF, so skip this AG. */
if (error == -EAGAIN) {
@@ -3474,7 +3349,7 @@ xfs_bmap_btalloc_accounting(
}
/* data/attr fork only */
- ap->ip->i_d.di_nblocks += args->len;
+ ap->ip->i_nblocks += args->len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel) {
ap->ip->i_delayed_blks -= args->len;
@@ -3485,35 +3360,17 @@ xfs_bmap_btalloc_accounting(
args->len);
}
-STATIC int
-xfs_bmap_btalloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+static int
+xfs_bmap_compute_alignments(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
{
- xfs_mount_t *mp; /* mount point structure */
- xfs_alloctype_t atype = 0; /* type for allocation routines */
- xfs_extlen_t align = 0; /* minimum allocation alignment */
- xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
- xfs_agnumber_t ag;
- xfs_alloc_arg_t args;
- xfs_fileoff_t orig_offset;
- xfs_extlen_t orig_length;
- xfs_extlen_t blen;
- xfs_extlen_t nextminlen = 0;
- int nullfb; /* true if ap->firstblock isn't set */
- int isaligned;
- int tryagain;
- int error;
- int stripe_align;
-
- ASSERT(ap->length);
- orig_offset = ap->offset;
- orig_length = ap->length;
-
- mp = ap->ip->i_mount;
+ struct xfs_mount *mp = args->mp;
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
+ int stripe_align = 0;
/* stripe alignment for allocation is determined by mount parameters */
- stripe_align = 0;
- if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ if (mp->m_swidth && xfs_has_swalloc(mp))
stripe_align = mp->m_swidth;
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
@@ -3523,13 +3380,172 @@ xfs_bmap_btalloc(
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
if (align) {
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 0, ap->eof, 0, ap->conv,
- &ap->offset, &ap->length);
- ASSERT(!error);
+ if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
+ ap->eof, 0, ap->conv, &ap->offset,
+ &ap->length))
+ ASSERT(0);
ASSERT(ap->length);
}
+ /* apply extent size hints if obtained earlier */
+ if (align) {
+ args->prod = align;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
+ args->prod = 1;
+ args->mod = 0;
+ } else {
+ args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ }
+
+ return stripe_align;
+}
+
+static void
+xfs_bmap_process_allocated_extent(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_fileoff_t orig_offset,
+ xfs_extlen_t orig_length)
+{
+ int nullfb;
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+
+ /*
+ * check the allocation happened at the same or higher AG than
+ * the first block that was allocated.
+ */
+ ASSERT(nullfb ||
+ XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
+ XFS_FSB_TO_AGNO(args->mp, args->fsbno));
+
+ ap->blkno = args->fsbno;
+ if (nullfb)
+ ap->tp->t_firstblock = args->fsbno;
+ ap->length = args->len;
+ /*
+ * If the extent size hint is active, we tried to round the
+ * caller's allocation request offset down to extsz and the
+ * length up to another extsz boundary. If we found a free
+ * extent we mapped it in starting at this new offset. If the
+ * newly mapped space isn't long enough to cover any of the
+ * range of offsets that was originally requested, move the
+ * mapping up so that we can fill as much of the caller's
+ * original request as possible. Free space is apparently
+ * very fragmented so we're unlikely to be able to satisfy the
+ * hints anyway.
+ */
+ if (ap->length <= orig_length)
+ ap->offset = orig_offset;
+ else if (ap->offset + ap->length < orig_offset + orig_length)
+ ap->offset = orig_offset + orig_length - ap->length;
+ xfs_bmap_btalloc_accounting(ap, args);
+}
+
+#ifdef DEBUG
+static int
+xfs_bmap_exact_minlen_extent_alloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ int error;
+
+ ASSERT(ap->length);
+
+ if (ap->minlen != 1) {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ args.alloc_minlen_only = 1;
+
+ xfs_bmap_compute_alignments(ap, &args);
+
+ if (ap->tp->t_firstblock == NULLFSBLOCK) {
+ /*
+ * Unlike the longest extent available in an AG, we don't track
+ * the length of an AG's shortest extent.
+ * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
+ * hence we can afford to start traversing from the 0th AG since
+ * we need not be concerned about a drop in performance in
+ * "debug only" code paths.
+ */
+ ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
+ } else {
+ ap->blkno = ap->tp->t_firstblock;
+ }
+
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minlen = args.maxlen = ap->minlen;
+ args.total = ap->total;
+
+ args.alignment = 1;
+ args.minalignslop = 0;
+
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else {
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+
+ return 0;
+}
+#else
+
+#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED)
+
+#endif
+
+STATIC int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_alloctype_t atype = 0;
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ stripe_align = xfs_bmap_compute_alignments(ap, &args);
nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
@@ -3560,9 +3576,6 @@ xfs_bmap_btalloc(
* Normal allocation, done through xfs_alloc_vextent.
*/
tryagain = isaligned = 0;
- memset(&args, 0, sizeof(args));
- args.tp = ap->tp;
- args.mp = mp;
args.fsbno = ap->blkno;
args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
@@ -3593,21 +3606,7 @@ xfs_bmap_btalloc(
args.total = ap->total;
args.minlen = ap->minlen;
}
- /* apply extent size hints if obtained earlier */
- if (align) {
- args.prod = align;
- div_u64_rem(ap->offset, args.prod, &args.mod);
- if (args.mod)
- args.mod = args.prod - args.mod;
- } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
- args.prod = 1;
- args.mod = 0;
- } else {
- args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
- div_u64_rem(ap->offset, args.prod, &args.mod);
- if (args.mod)
- args.mod = args.prod - args.mod;
- }
+
/*
* If we are not low on available data blocks, and the underlying
* logical volume manager is a stripe, and the file offset is zero then
@@ -3709,37 +3708,10 @@ xfs_bmap_btalloc(
return error;
ap->tp->t_flags |= XFS_TRANS_LOWMODE;
}
+
if (args.fsbno != NULLFSBLOCK) {
- /*
- * check the allocation happened at the same or higher AG than
- * the first block that was allocated.
- */
- ASSERT(ap->tp->t_firstblock == NULLFSBLOCK ||
- XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock) <=
- XFS_FSB_TO_AGNO(mp, args.fsbno));
-
- ap->blkno = args.fsbno;
- if (ap->tp->t_firstblock == NULLFSBLOCK)
- ap->tp->t_firstblock = args.fsbno;
- ASSERT(nullfb || fb_agno <= args.agno);
- ap->length = args.len;
- /*
- * If the extent size hint is active, we tried to round the
- * caller's allocation request offset down to extsz and the
- * length up to another extsz boundary. If we found a free
- * extent we mapped it in starting at this new offset. If the
- * newly mapped space isn't long enough to cover any of the
- * range of offsets that was originally requested, move the
- * mapping up so that we can fill as much of the caller's
- * original request as possible. Free space is apparently
- * very fragmented so we're unlikely to be able to satisfy the
- * hints anyway.
- */
- if (ap->length <= orig_length)
- ap->offset = orig_offset;
- else if (ap->offset + ap->length < orig_offset + orig_length)
- ap->offset = orig_offset + orig_length - ap->length;
- xfs_bmap_btalloc_accounting(ap, &args);
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
} else {
ap->blkno = NULLFSBLOCK;
ap->length = 0;
@@ -3792,7 +3764,7 @@ xfs_bmapi_trim_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int n,
- int flags)
+ uint32_t flags)
{
if ((flags & XFS_BMAPI_ENTIRE) ||
got->br_startoff + got->br_blockcount <= obno) {
@@ -3837,7 +3809,7 @@ xfs_bmapi_update_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int *n,
- int flags)
+ uint32_t flags)
{
xfs_bmbt_irec_t *mval = *map;
@@ -3890,10 +3862,11 @@ xfs_bmapi_read(
xfs_filblks_t len,
struct xfs_bmbt_irec *mval,
int *nmap,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
xfs_fileoff_t obno;
xfs_fileoff_t end;
@@ -3901,53 +3874,26 @@ xfs_bmapi_read(
int error;
bool eof = false;
int n = 0;
- int whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
- ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
- XFS_BMAPI_COWFORK)));
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ if (WARN_ON_ONCE(!ifp))
+ return -EFSCORRUPTED;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
return -EFSCORRUPTED;
- }
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
XFS_STATS_INC(mp, xs_blk_mapr);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!ifp) {
- /* No CoW fork? Return a hole. */
- if (whichfork == XFS_COW_FORK) {
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount = len;
- mval->br_state = XFS_EXT_NORM;
- *nmap = 1;
- return 0;
- }
-
- /*
- * A missing attr ifork implies that the inode says we're in
- * extents or btree format but failed to pass the inode fork
- * verifier while trying to load it. Treat that as a file
- * corruption too.
- */
-#ifdef DEBUG
- xfs_alert(mp, "%s: inode %llu missing fork %d",
- __func__, ip->i_ino, whichfork);
-#endif /* DEBUG */
- return -EFSCORRUPTED;
- }
-
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ return error;
if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got))
eof = true;
@@ -4013,7 +3959,7 @@ xfs_bmapi_reserve_delalloc(
int eof)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
int error;
@@ -4023,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
- alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
@@ -4047,8 +3993,7 @@ xfs_bmapi_reserve_delalloc(
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
- error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_reserve_blkres(ip, alen);
if (error)
return error;
@@ -4094,8 +4039,7 @@ out_unreserve_blocks:
xfs_mod_fdblocks(mp, alen, false);
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
- xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0,
- XFS_QMOPT_RES_REGBLKS);
+ xfs_quota_unreserve_blkres(ip, alen);
return error;
}
@@ -4129,6 +4073,10 @@ xfs_bmap_alloc_userdata(
return xfs_bmap_rtalloc(bma);
}
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ return xfs_bmap_exact_minlen_extent_alloc(bma);
+
return xfs_bmap_btalloc(bma);
}
@@ -4138,7 +4086,7 @@ xfs_bmapi_allocate(
{
struct xfs_mount *mp = bma->ip->i_mount;
int whichfork = xfs_bmapi_whichfork(bma->flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4154,7 +4102,7 @@ xfs_bmapi_allocate(
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
bma->prev.br_startoff = NULLFILEOFF;
} else {
- bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
if (!bma->eof)
bma->length = XFS_FILBLKS_MIN(bma->length,
bma->got.br_startoff - bma->offset);
@@ -4165,10 +4113,15 @@ xfs_bmapi_allocate(
else
bma->minlen = 1;
- if (bma->flags & XFS_BMAPI_METADATA)
- error = xfs_bmap_btalloc(bma);
- else
+ if (bma->flags & XFS_BMAPI_METADATA) {
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ error = xfs_bmap_exact_minlen_extent_alloc(bma);
+ else
+ error = xfs_bmap_btalloc(bma);
+ } else {
error = xfs_bmap_alloc_userdata(bma);
+ }
if (error || bma->blkno == NULLFSBLOCK)
return error;
@@ -4178,7 +4131,7 @@ xfs_bmapi_allocate(
return error;
}
- if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur)
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur)
bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
/*
* Bump the number of extents we've allocated
@@ -4187,25 +4140,15 @@ xfs_bmapi_allocate(
bma->nallocs++;
if (bma->cur)
- bma->cur->bc_private.b.flags =
- bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ bma->cur->bc_ino.flags =
+ bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
bma->got.br_blockcount = bma->length;
bma->got.br_state = XFS_EXT_NORM;
- /*
- * In the data fork, a wasdelay extent has been initialized, so
- * shouldn't be flagged as unwritten.
- *
- * For the cow fork, however, we convert delalloc reservations
- * (extents allocated for speculative preallocation) to
- * allocated unwritten extents, and only convert the unwritten
- * extents to real extents when we're about to write the data.
- */
- if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
- (bma->flags & XFS_BMAPI_PREALLOC))
+ if (bma->flags & XFS_BMAPI_PREALLOC)
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
@@ -4239,10 +4182,10 @@ xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
struct xfs_bmbt_irec *mval,
xfs_filblks_t len,
- int flags)
+ uint32_t flags)
{
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4261,7 +4204,7 @@ xfs_bmapi_convert_unwritten(
* Modify (by adding) the state flag, if writing.
*/
ASSERT(mval->br_blockcount <= len);
- if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE && !bma->cur) {
bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp,
bma->ip, whichfork);
}
@@ -4319,11 +4262,13 @@ xfs_bmapi_minleft(
struct xfs_inode *ip,
int fork)
{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork);
+
if (tp && tp->t_firstblock != NULLFSBLOCK)
return 0;
- if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
return 1;
- return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+ return be16_to_cpu(ifp->if_broot->bb_level) + 1;
}
/*
@@ -4338,11 +4283,13 @@ xfs_bmapi_finish(
int whichfork,
int error)
{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
+
if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
bma->logflags &= ~xfs_ilog_fext(whichfork);
else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
bma->logflags &= ~xfs_ilog_fbroot(whichfork);
if (bma->logflags)
@@ -4363,7 +4310,7 @@ xfs_bmapi_write(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting file offs. mapped */
xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
+ uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
@@ -4374,13 +4321,13 @@ xfs_bmapi_write(
.total = total,
};
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
int n; /* current extent index */
xfs_fileoff_t obno; /* old block number (offset) */
- int whichfork; /* data or attr fork */
#ifdef DEBUG
xfs_fileoff_t orig_bno; /* original block number value */
@@ -4395,13 +4342,12 @@ xfs_bmapi_write(
orig_mval = mval;
orig_nmap = *nmap;
#endif
- whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(tp != NULL);
ASSERT(len > 0);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & XFS_BMAPI_REMAP));
@@ -4417,23 +4363,19 @@ xfs_bmapi_write(
ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
- ifp = XFS_IFORK_PTR(ip, whichfork);
-
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- goto error0;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ goto error0;
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
@@ -4480,8 +4422,8 @@ xfs_bmapi_write(
* xfs_extlen_t and therefore 32 bits. Hence we have to
* check for 32-bit overflows and handle them here.
*/
- if (len > (xfs_filblks_t)MAXEXTLEN)
- bma.length = MAXEXTLEN;
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
else
bma.length = len;
@@ -4536,9 +4478,8 @@ xfs_bmapi_write(
if (error)
goto error0;
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) >
- XFS_IFORK_MAXEXT(ip, whichfork));
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE ||
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
xfs_bmapi_finish(&bma, whichfork, 0);
xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
orig_nmap, *nmap);
@@ -4562,7 +4503,7 @@ xfs_bmapi_convert_delalloc(
struct iomap *iomap,
unsigned int *seq)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
@@ -4585,6 +4526,14 @@ xfs_bmapi_convert_delalloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, whichfork,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
bma.got.br_startoff > offset_fsb) {
/*
@@ -4602,7 +4551,7 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4611,10 +4560,26 @@ xfs_bmapi_convert_delalloc(
bma.ip = ip;
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+
+ /*
+ * When we're converting the delalloc reservations backing dirty pages
+ * in the page cache, we must be careful about how we create the new
+ * extents:
+ *
+ * New CoW fork extents are created unwritten, turned into real extents
+ * when we're about to write the data to disk, and mapped into the data
+ * fork after the write finishes. End of story.
+ *
+ * New data fork extents must be mapped in as unwritten and converted
+ * to real extents after the write succeeds to avoid exposing stale
+ * disk contents if we crash.
+ */
+ bma.flags = XFS_BMAPI_PREALLOC;
if (whichfork == XFS_COW_FORK)
- bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+ bma.flags |= XFS_BMAPI_COWFORK;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
@@ -4634,7 +4599,7 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
@@ -4665,7 +4630,7 @@ xfs_bmapi_remap(
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4675,28 +4640,26 @@ xfs_bmapi_remap(
int whichfork = xfs_bmapi_whichfork(flags);
int logflags = 0, error;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(len > 0);
- ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
/* make sure we only reflink into a hole. */
@@ -4704,12 +4667,12 @@ xfs_bmapi_remap(
ASSERT(got.br_startoff - bno >= len);
}
- ip->i_d.di_nblocks += len;
+ ip->i_nblocks += len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (ifp->if_flags & XFS_IFBROOT) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
got.br_startoff = bno;
@@ -4728,9 +4691,9 @@ xfs_bmapi_remap(
error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
- if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
+ if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS)
logflags &= ~XFS_ILOG_DEXT;
- else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE)
logflags &= ~XFS_ILOG_DBROOT;
if (logflags)
@@ -4834,12 +4797,12 @@ xfs_bmap_del_extent_delay(
struct xfs_bmbt_irec *del)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec new;
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
int error = 0;
bool isrt;
@@ -4867,9 +4830,8 @@ xfs_bmap_del_extent_delay(
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -((long)del->br_blockcount), 0,
- isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ ASSERT(!isrt);
+ error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
if (error)
return error;
ip->i_delayed_blks -= del->br_blockcount;
@@ -4962,10 +4924,10 @@ xfs_bmap_del_extent_cow(
struct xfs_bmbt_irec *del)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec new;
xfs_fileoff_t del_endoff, got_endoff;
- int state = BMAP_COWFORK;
+ uint32_t state = BMAP_COWFORK;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -5034,11 +4996,11 @@ xfs_bmap_del_extent_real(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
struct xfs_iext_cursor *icur,
- xfs_btree_cur_t *cur, /* if null, not a btree */
+ struct xfs_btree_cur *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
- int bflags) /* bmapi flags */
+ uint32_t bflags) /* bmapi flags */
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
@@ -5054,13 +5016,13 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(del->br_blockcount > 0);
xfs_iext_get_extent(ifp, icur, &got);
ASSERT(got.br_startoff <= del->br_startoff);
@@ -5080,28 +5042,32 @@ xfs_bmap_del_extent_real(
* conversion to btree format, since the transaction will be dirty then.
*/
if (tp->t_blk_res == 0 &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >=
- XFS_IFORK_MAXEXT(ip, whichfork) &&
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) &&
del->br_startoff > got.br_startoff && del_endoff < got_endoff)
return -ENOSPC;
flags = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
xfs_filblks_t len;
xfs_extlen_t mod;
- bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
&mod);
ASSERT(mod == 0);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
+ if (!(bflags & XFS_BMAPI_REMAP)) {
+ xfs_fsblock_t bno;
+
+ bno = div_u64_rem(del->br_startblock,
+ mp->m_sb.sb_rextsize, &mod);
+ ASSERT(mod == 0);
+
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
+ }
+
do_fx = 0;
nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
@@ -5134,8 +5100,8 @@ xfs_bmap_del_extent_real(
*/
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ ifp->if_nextents--;
+
flags |= XFS_ILOG_CORE;
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
@@ -5182,6 +5148,7 @@ xfs_bmap_del_extent_real(
/*
* Deleting the middle of the extent.
*/
+
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5243,8 +5210,8 @@ xfs_bmap_del_extent_real(
}
} else
flags |= xfs_ilog_fext(whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+
+ ifp->if_nextents++;
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, &new, state);
break;
@@ -5260,7 +5227,7 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- __xfs_bmap_add_free(tp, del->br_startblock,
+ __xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
@@ -5271,7 +5238,7 @@ xfs_bmap_del_extent_real(
* Adjust inode # blocks in the file.
*/
if (nblks)
- ip->i_d.di_nblocks -= nblks;
+ ip->i_nblocks -= nblks;
/*
* Adjust quota data.
*/
@@ -5295,7 +5262,7 @@ __xfs_bunmapi(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
- int flags, /* misc flags */
+ uint32_t flags, /* misc flags */
xfs_extnum_t nexts) /* number of extents max */
{
struct xfs_btree_cur *cur; /* bmap btree cursor */
@@ -5313,8 +5280,6 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
- xfs_fileoff_t max_len;
- xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
@@ -5323,29 +5288,20 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)))
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
return -EFSCORRUPTED;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(len > 0);
ASSERT(nexts >= 0);
- /*
- * Guesstimate how many blocks we can unmap without running the risk of
- * blowing out the transaction with a mix of EFIs and reflink
- * adjustments.
- */
- if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
- max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
- else
- max_len = len;
-
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
return error;
+
if (xfs_iext_count(ifp) == 0) {
*rlen = 0;
return 0;
@@ -5361,10 +5317,10 @@ __xfs_bunmapi(
end--;
logflags = 0;
- if (ifp->if_flags & XFS_IFBROOT) {
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
} else
cur = NULL;
@@ -5380,7 +5336,7 @@ __xfs_bunmapi(
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
- (nexts == 0 || extno < nexts) && max_len > 0) {
+ (nexts == 0 || extno < nexts)) {
/*
* Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
@@ -5405,16 +5361,6 @@ __xfs_bunmapi(
del = got;
wasdel = isnullstartblock(del.br_startblock);
- /*
- * Make sure we don't touch multiple AGF headers out of order
- * in a single transaction, as that could cause AB-BA deadlocks.
- */
- if (!wasdel && !isrt) {
- agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
- if (prev_agno != NULLAGNUMBER && prev_agno > agno)
- break;
- prev_agno = agno;
- }
if (got.br_startoff < start) {
del.br_startoff = start;
del.br_blockcount -= start - got.br_startoff;
@@ -5424,14 +5370,6 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- /* How much can we safely unmap? */
- if (max_len < del.br_blockcount) {
- del.br_startoff += del.br_blockcount - max_len;
- if (!wasdel)
- del.br_startblock += del.br_blockcount - max_len;
- del.br_blockcount = max_len;
- }
-
if (!isrt)
goto delete;
@@ -5567,7 +5505,6 @@ delete:
if (error)
goto error0;
- max_len -= del.br_blockcount;
end = del.br_startoff - 1;
nodelete:
/*
@@ -5607,10 +5544,10 @@ error0:
* logging the extent records if we've converted to btree format.
*/
if ((logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
logflags &= ~xfs_ilog_fext(whichfork);
else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
logflags &= ~xfs_ilog_fbroot(whichfork);
/*
* Log inode even in the error case, if the transaction
@@ -5620,7 +5557,7 @@ error0:
xfs_trans_log_inode(tp, ip, logflags);
if (cur) {
if (!error)
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -5633,7 +5570,7 @@ xfs_bunmapi(
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_extnum_t nexts,
int *done)
{
@@ -5665,7 +5602,7 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
return false;
return true;
@@ -5692,6 +5629,7 @@ xfs_bmse_merge(
struct xfs_btree_cur *cur,
int *logflags) /* output */
{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
@@ -5710,8 +5648,7 @@ xfs_bmse_merge(
* Update the on-disk extent count, the btree if necessary and log the
* inode.
*/
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ ifp->if_nextents--;
*logflags |= XFS_ILOG_CORE;
if (!cur) {
*logflags |= XFS_ILOG_DEXT;
@@ -5749,7 +5686,7 @@ xfs_bmse_merge(
done:
xfs_iext_remove(ip, icur, 0);
- xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
+ xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
&new);
@@ -5813,7 +5750,7 @@ xfs_bmap_collapse_extents(
{
int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got, prev;
struct xfs_iext_cursor icur;
@@ -5821,25 +5758,23 @@ xfs_bmap_collapse_extents(
int error = 0;
int logflags = 0;
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
- if (ifp->if_flags & XFS_IFBROOT) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
@@ -5906,7 +5841,7 @@ xfs_bmap_can_insert_extents(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -5930,7 +5865,7 @@ xfs_bmap_insert_extents(
{
int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got, next;
struct xfs_iext_cursor icur;
@@ -5938,25 +5873,23 @@ xfs_bmap_insert_extents(
int error = 0;
int logflags = 0;
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
- if (ifp->if_flags & XFS_IFBROOT) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
if (*next_fsb == NULLFSBLOCK) {
@@ -6025,39 +5958,36 @@ del_cursor:
* @split_fsb is a block where the extents is split. If split_fsb lies in a
* hole or the first block of extents, just return 0.
*/
-STATIC int
-xfs_bmap_split_extent_at(
+int
+xfs_bmap_split_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t split_fsb)
{
int whichfork = XFS_DATA_FORK;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec new; /* split extent */
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
xfs_fsblock_t gotblkcnt; /* new block count for got */
struct xfs_iext_cursor icur;
int error = 0;
int logflags = 0;
int i = 0;
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- /* Read in all the extents */
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
/*
* If there are not extents, or split_fsb lies in a hole we are done.
@@ -6072,9 +6002,9 @@ xfs_bmap_split_extent_at(
new.br_blockcount = got.br_blockcount - gotblkcnt;
new.br_state = got.br_state;
- if (ifp->if_flags & XFS_IFBROOT) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
@@ -6099,8 +6029,7 @@ xfs_bmap_split_extent_at(
/* Add new extent */
xfs_iext_next(ifp, &icur);
xfs_iext_insert(ip, &icur, &new, 0);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ ifp->if_nextents++;
if (cur) {
error = xfs_bmbt_lookup_eq(cur, &new, &i);
@@ -6133,7 +6062,7 @@ xfs_bmap_split_extent_at(
del_cursor:
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
@@ -6142,34 +6071,6 @@ del_cursor:
return error;
}
-int
-xfs_bmap_split_extent(
- struct xfs_inode *ip,
- xfs_fileoff_t split_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- error = xfs_bmap_split_extent_at(tp, ip, split_fsb);
- if (error)
- goto out;
-
- return xfs_trans_commit(tp);
-
-out:
- xfs_trans_cancel(tp);
- return error;
-}
-
/* Deferred mapping is only for real extents in the data fork. */
static bool
xfs_bmap_is_update_needed(
@@ -6199,7 +6100,7 @@ __xfs_bmap_add(
bmap->br_blockcount,
bmap->br_state);
- bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
+ bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
@@ -6293,26 +6194,37 @@ xfs_bmap_validate_extent(
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fsblock_t endfsb;
- bool isrt;
- isrt = XFS_IS_REALTIME_INODE(ip);
- endfsb = irec->br_startblock + irec->br_blockcount - 1;
- if (isrt) {
- if (!xfs_verify_rtbno(mp, irec->br_startblock))
- return __this_address;
- if (!xfs_verify_rtbno(mp, endfsb))
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ return __this_address;
+
+ if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
+ if (!xfs_verify_rtext(mp, irec->br_startblock,
+ irec->br_blockcount))
return __this_address;
} else {
- if (!xfs_verify_fsbno(mp, irec->br_startblock))
- return __this_address;
- if (!xfs_verify_fsbno(mp, endfsb))
- return __this_address;
- if (XFS_FSB_TO_AGNO(mp, irec->br_startblock) !=
- XFS_FSB_TO_AGNO(mp, endfsb))
+ if (!xfs_verify_fsbext(mp, irec->br_startblock,
+ irec->br_blockcount))
return __this_address;
}
if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
return __this_address;
return NULL;
}
+
+int __init
+xfs_bmap_intent_init_cache(void)
+{
+ xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent",
+ sizeof(struct xfs_bmap_intent),
+ 0, 0, NULL);
+
+ return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_bmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_bmap_intent_cache);
+ xfs_bmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 14d25e0b7d9c..16db95b11589 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -13,8 +13,6 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
-extern kmem_zone_t *xfs_bmap_free_item_zone;
-
/*
* Argument structure for xfs_bmap_alloc.
*/
@@ -41,20 +39,7 @@ struct xfs_bmalloca {
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
int datatype;/* data type being allocated */
- int flags;
-};
-
-/*
- * List of extents to be free "later".
- * The list is kept sorted on xbf_startblock.
- */
-struct xfs_extent_free_item
-{
- xfs_fsblock_t xefi_startblock;/* starting fs block number */
- xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
- struct list_head xefi_list;
- struct xfs_owner_info xefi_oinfo; /* extent owner */
- bool xefi_skip_discard;
+ uint32_t flags;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -62,17 +47,17 @@ struct xfs_extent_free_item
/*
* Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
-#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
/*
* unwritten extent conversion - this needs write cache flushing and no additional
* allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
* from written to unwritten, otherwise convert from unwritten to written.
*/
-#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_CONVERT (1u << 5)
/*
* allocate zeroed extents - this requires all newly allocated user data extents
@@ -80,7 +65,7 @@ struct xfs_extent_free_item
* Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
* during the allocation range to zeroed written extents.
*/
-#define XFS_BMAPI_ZERO 0x080
+#define XFS_BMAPI_ZERO (1u << 6)
/*
* Map the inode offset to the block given in ap->firstblock. Primarily
@@ -90,16 +75,16 @@ struct xfs_extent_free_item
* For bunmapi, this flag unmaps the range without adjusting quota, reducing
* refcount, or freeing the blocks.
*/
-#define XFS_BMAPI_REMAP 0x100
+#define XFS_BMAPI_REMAP (1u << 7)
/* Map something in the CoW fork. */
-#define XFS_BMAPI_COWFORK 0x200
+#define XFS_BMAPI_COWFORK (1u << 8)
/* Skip online discard of freed extents */
-#define XFS_BMAPI_NODISCARD 0x1000
+#define XFS_BMAPI_NODISCARD (1u << 9)
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
-#define XFS_BMAPI_NORMAP 0x2000
+#define XFS_BMAPI_NORMAP (1u << 10)
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -121,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w)
(w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
}
-static inline int xfs_bmapi_whichfork(int bmapi_flags)
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{
if (bmapi_flags & XFS_BMAPI_COWFORK)
return XFS_COW_FORK;
@@ -139,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
* Flags for xfs_bmap_add_extent*.
*/
-#define BMAP_LEFT_CONTIG (1 << 0)
-#define BMAP_RIGHT_CONTIG (1 << 1)
-#define BMAP_LEFT_FILLING (1 << 2)
-#define BMAP_RIGHT_FILLING (1 << 3)
-#define BMAP_LEFT_DELAY (1 << 4)
-#define BMAP_RIGHT_DELAY (1 << 5)
-#define BMAP_LEFT_VALID (1 << 6)
-#define BMAP_RIGHT_VALID (1 << 7)
-#define BMAP_ATTRFORK (1 << 8)
-#define BMAP_COWFORK (1 << 9)
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
@@ -158,17 +143,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
{ BMAP_ATTRFORK, "ATTR" }, \
{ BMAP_COWFORK, "COW" }
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
/*
* Return true if the extent is a real, allocated extent, or false if it is a
* delayed allocation, and unwritten extent or a hole.
*/
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
{
- return irec->br_state != XFS_EXT_UNWRITTEN &&
- irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !isnullstartblock(irec->br_startblock);
+ return xfs_bmap_is_real_extent(irec) &&
+ irec->br_state != XFS_EXT_UNWRITTEN;
}
/*
@@ -180,13 +170,10 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
+unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
-int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork);
-void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
- xfs_filblks_t len, const struct xfs_owner_info *oinfo,
- bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -194,18 +181,17 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *last_block, int whichfork);
int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
-int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
+ int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
@@ -222,7 +208,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off,
int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
bool *done, xfs_fileoff_t stop_fsb);
-int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
@@ -234,16 +221,6 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new, int *logflagsp);
-static inline void
-xfs_bmap_add_free(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo)
-{
- __xfs_bmap_add_free(tp, bno, len, oinfo, false);
-}
-
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
XFS_BMAP_UNMAP,
@@ -252,8 +229,8 @@ enum xfs_bmap_intent_type {
struct xfs_bmap_intent {
struct list_head bi_list;
enum xfs_bmap_intent_type bi_type;
- struct xfs_inode *bi_owner;
int bi_whichfork;
+ struct xfs_inode *bi_owner;
struct xfs_bmbt_irec bi_bmap;
};
@@ -266,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-static inline int xfs_bmap_fork_to_state(int whichfork)
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
switch (whichfork) {
case XFS_ATTR_FORK:
@@ -283,6 +260,11 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
- int flags);
+ uint32_t flags);
+
+extern struct kmem_cache *xfs_bmap_intent_cache;
+
+int __init xfs_bmap_intent_init_cache(void);
+void xfs_bmap_intent_destroy_cache(void);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index ffe608d2a2d9..cfa052d40105 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -22,6 +22,8 @@
#include "xfs_trace.h"
#include "xfs_rmap.h"
+static struct kmem_cache *xfs_bmbt_cur_cache;
+
/*
* Convert on-disk form of btree root to in-memory form.
*/
@@ -58,7 +60,7 @@ xfs_bmdr_to_bmbt(
void
xfs_bmbt_disk_get_all(
- struct xfs_bmbt_rec *rec,
+ const struct xfs_bmbt_rec *rec,
struct xfs_bmbt_irec *irec)
{
uint64_t l0 = get_unaligned_be64(&rec->l0);
@@ -78,7 +80,7 @@ xfs_bmbt_disk_get_all(
*/
xfs_filblks_t
xfs_bmbt_disk_get_blockcount(
- xfs_bmbt_rec_t *r)
+ const struct xfs_bmbt_rec *r)
{
return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
}
@@ -88,7 +90,7 @@ xfs_bmbt_disk_get_blockcount(
*/
xfs_fileoff_t
xfs_bmbt_disk_get_startoff(
- xfs_bmbt_rec_t *r)
+ const struct xfs_bmbt_rec *r)
{
return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
@@ -136,7 +138,7 @@ xfs_bmbt_to_bmdr(
xfs_bmbt_key_t *tkp;
__be64 *tpp;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
&mp->m_sb.sb_meta_uuid));
@@ -166,13 +168,13 @@ xfs_bmbt_dup_cursor(
struct xfs_btree_cur *new;
new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ cur->bc_ino.ip, cur->bc_ino.whichfork);
/*
* Copy the firstblock, dfops, and flags values,
* since init cursor doesn't get them.
*/
- new->bc_private.b.flags = cur->bc_private.b.flags;
+ new->bc_ino.flags = cur->bc_ino.flags;
return new;
}
@@ -183,20 +185,20 @@ xfs_bmbt_update_cursor(
struct xfs_btree_cur *dst)
{
ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) ||
- (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+ (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
- dst->bc_private.b.allocated += src->bc_private.b.allocated;
+ dst->bc_ino.allocated += src->bc_ino.allocated;
dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock;
- src->bc_private.b.allocated = 0;
+ src->bc_ino.allocated = 0;
}
STATIC int
xfs_bmbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
int error; /* error return value */
@@ -205,8 +207,8 @@ xfs_bmbt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.fsbno = cur->bc_tp->t_firstblock;
- xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
- cur->bc_private.b.whichfork);
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork);
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
@@ -230,7 +232,7 @@ xfs_bmbt_alloc_block(
}
args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
if (!args.wasdel && args.tp->t_blk_res == 0) {
error = -ENOSPC;
goto error0;
@@ -259,10 +261,10 @@ xfs_bmbt_alloc_block(
ASSERT(args.len == 1);
cur->bc_tp->t_firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+ cur->bc_ino.allocated++;
+ cur->bc_ino.ip->i_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
XFS_TRANS_DQ_BCOUNT, 1L);
new->l = cpu_to_be64(args.fsbno);
@@ -280,14 +282,14 @@ xfs_bmbt_free_block(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_trans *tp = cur->bc_tp;
- xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
struct xfs_owner_info oinfo;
- xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork);
- xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo);
- ip->i_d.di_nblocks--;
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
+ xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+ ip->i_nblocks--;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
@@ -302,8 +304,8 @@ xfs_bmbt_get_minrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0) / 2;
@@ -320,8 +322,8 @@ xfs_bmbt_get_maxrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0);
@@ -347,13 +349,13 @@ xfs_bmbt_get_dmaxrecs(
{
if (level != cur->bc_nlevels - 1)
return cur->bc_mp->m_bmap_dmxr[level != 0];
- return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+ return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0);
}
STATIC void
xfs_bmbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->bmbt.br_startoff =
cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
@@ -361,8 +363,8 @@ xfs_bmbt_init_key_from_rec(
STATIC void
xfs_bmbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->bmbt.br_startoff = cpu_to_be64(
xfs_bmbt_disk_get_startoff(&rec->bmbt) +
@@ -387,8 +389,8 @@ xfs_bmbt_init_ptr_from_cur(
STATIC int64_t
xfs_bmbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
cur->bc_rec.b.br_startoff;
@@ -396,12 +398,12 @@ xfs_bmbt_key_diff(
STATIC int64_t
xfs_bmbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
- uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
- uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+ uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
+ uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
/*
* Note: This routine previously casted a and b to int64 and subtracted
@@ -428,7 +430,7 @@ xfs_bmbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -497,9 +499,9 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = {
STATIC int
xfs_bmbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be64_to_cpu(k1->bmbt.br_startoff) <
be64_to_cpu(k2->bmbt.br_startoff);
@@ -507,9 +509,9 @@ xfs_bmbt_keys_inorder(
STATIC int
xfs_bmbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
@@ -548,33 +550,40 @@ xfs_bmbt_init_cursor(
struct xfs_inode *ip, /* inode owning the btree */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
+ mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- cur->bc_btnum = XFS_BTNUM_BMAP;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
- cur->bc_private.b.ip = ip;
- cur->bc_private.b.allocated = 0;
- cur->bc_private.b.flags = 0;
- cur->bc_private.b.whichfork = whichfork;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+ cur->bc_ino.ip = ip;
+ cur->bc_ino.allocated = 0;
+ cur->bc_ino.flags = 0;
+ cur->bc_ino.whichfork = whichfork;
return cur;
}
+/* Calculate number of records in a block mapping btree block. */
+static inline unsigned int
+xfs_bmbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
/*
* Calculate number of records in a bmap btree block.
*/
@@ -585,10 +594,29 @@ xfs_bmbt_maxrecs(
int leaf)
{
blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ return xfs_bmbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_bmbt_rec_t);
- return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
+unsigned int
+xfs_bmbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
+
+ /* One extra level for the inode root. */
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}
/*
@@ -636,15 +664,10 @@ xfs_bmbt_change_owner(
ASSERT(tp || buffer_list);
ASSERT(!(tp && buffer_list));
- if (whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
- else
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+ ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
- if (!cur)
- return -ENOMEM;
- cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
+ cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error);
@@ -659,3 +682,22 @@ xfs_bmbt_calc_size(
{
return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
}
+
+int __init
+xfs_bmbt_init_cur_cache(void)
+{
+ xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur",
+ xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_bmbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_bmbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_bmbt_cur_cache);
+ xfs_bmbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 29b407d053b4..3e7a40a83835 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -16,7 +16,7 @@ struct xfs_trans;
* Btree block header size depends on a superblock flag.
*/
#define XFS_BMBT_BLOCK_LEN(mp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ (xfs_has_crc(((mp))) ? \
XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
#define XFS_BMBT_REC_ADDR(mp, block, index) \
@@ -88,9 +88,10 @@ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
struct xfs_btree_block *, int);
void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
-extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
-extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r);
+void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s);
extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
@@ -109,4 +110,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+unsigned int xfs_bmbt_maxlevels_ondisk(void);
+
+int __init xfs_bmbt_init_cur_cache(void);
+void xfs_bmbt_destroy_cur_cache(void);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index fd300dc93ca4..4c16c8c31fcb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -20,11 +20,13 @@
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
-
-/*
- * Cursor allocation zone.
- */
-kmem_zone_t *xfs_btree_cur_zone;
+#include "xfs_btree_staging.h"
+#include "xfs_ag.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
/*
* Btree magic numbers.
@@ -50,6 +52,70 @@ xfs_btree_magic(
}
/*
+ * These sibling pointer checks are optimised for null sibling pointers. This
+ * happens a lot, and we don't need to byte swap at runtime if the sibling
+ * pointer is NULL.
+ *
+ * These are explicitly marked at inline because the cost of calling them as
+ * functions instead of inlining them is about 36 bytes extra code per call site
+ * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
+ * two sibling check functions reduces the compiled code size by over 300
+ * bytes.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ __be64 dsibling)
+{
+ xfs_fsblock_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_perag *pag,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agblock_t agbno,
+ __be32 dsibling)
+{
+ xfs_agblock_t sibling;
+
+ if (dsibling == cpu_to_be32(NULLAGBLOCK))
+ return NULL;
+
+ sibling = be32_to_cpu(dsibling);
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
+/*
* Check a long btree block header. Return the address of the failing check,
* or NULL if everything is ok.
*/
@@ -62,13 +128,15 @@ __xfs_btree_check_lblock(
{
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.l.bb_blkno !=
- cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
return __this_address;
if (block->bb_u.l.bb_pad != cpu_to_be32(0))
return __this_address;
@@ -81,16 +149,16 @@ __xfs_btree_check_lblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/* Check a long btree block header. */
@@ -126,14 +194,17 @@ __xfs_btree_check_sblock(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (block->bb_u.s.bb_blkno !=
- cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
return __this_address;
}
@@ -144,16 +215,16 @@ __xfs_btree_check_sblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
}
/* Check a short btree block header. */
@@ -214,7 +285,7 @@ xfs_btree_check_sptr(
{
if (level <= 0)
return false;
- return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
+ return xfs_verify_agbno(cur->bc_ag.pag, agbno);
}
/*
@@ -223,10 +294,10 @@ xfs_btree_check_sptr(
*/
static int
xfs_btree_check_ptr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int index,
- int level)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
@@ -234,8 +305,8 @@ xfs_btree_check_ptr(
return 0;
xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_private.b.ip->i_ino,
- cur->bc_private.b.whichfork, cur->bc_btnum,
+ cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork, cur->bc_btnum,
level, index);
} else {
if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
@@ -243,7 +314,7 @@ xfs_btree_check_ptr(
return 0;
xfs_err(cur->bc_mp,
"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_private.a.agno, cur->bc_btnum,
+ cur->bc_ag.pag->pag_agno, cur->bc_btnum,
level, index);
}
@@ -271,7 +342,7 @@ xfs_btree_lblock_calc_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ if (!xfs_has_crc(bp->b_mount))
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -285,7 +356,7 @@ xfs_btree_lblock_verify_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_mount *mp = bp->b_mount;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
@@ -309,7 +380,7 @@ xfs_btree_sblock_calc_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ if (!xfs_has_crc(bp->b_mount))
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -323,7 +394,7 @@ xfs_btree_sblock_verify_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_mount *mp = bp->b_mount;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
@@ -352,37 +423,38 @@ xfs_btree_free_block(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
- int error) /* del because of error */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
{
- int i; /* btree level */
+ int i; /* btree level */
/*
- * Clear the buffer pointers, and release the buffers.
- * If we're doing this in the face of an error, we
- * need to make sure to inspect all of the entries
- * in the bc_bufs array for buffers to be unlocked.
- * This is because some of the btree code works from
- * level n down to 0, and if we get an error along
- * the way we won't have initialized all the entries
- * down to 0.
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_bufs
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ if (cur->bc_levels[i].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
else if (!error)
break;
}
+
/*
- * Can't free a bmap cursor without having dealt with the
- * allocated indirect blocks' accounting.
- */
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_private.b.allocated == 0);
- /*
- * Free the cursor.
+ * If we are doing a BMBT update, the number of unaccounted blocks
+ * allocated during this cursor life time should be zero. If it's not
+ * zero, then we should be shut down or on our way to shutdown due to
+ * cancelling a dirty transaction on error.
*/
- kmem_cache_free(xfs_btree_cur_zone, cur);
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ xfs_is_shutdown(cur->bc_mp) || error != 0);
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+ kmem_free(cur->bc_ops);
+ if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
+ xfs_perag_put(cur->bc_ag.pag);
+ kmem_cache_free(cur->bc_cache, cur);
}
/*
@@ -391,14 +463,14 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur) /* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur) /* output cursor */
{
- xfs_buf_t *bp; /* btree block's buffer pointer */
+ struct xfs_buf *bp; /* btree block's buffer pointer */
int error; /* error return value */
int i; /* level number of btree block */
xfs_mount_t *mp; /* mount structure for filesystem */
- xfs_btree_cur_t *new; /* new cursor value */
+ struct xfs_btree_cur *new; /* new cursor value */
xfs_trans_t *tp; /* transaction pointer, can be NULL */
tp = cur->bc_tp;
@@ -418,12 +490,12 @@ xfs_btree_dup_cursor(
* For each level current, re-get the buffer and copy the ptr value.
*/
for (i = 0; i < new->bc_nlevels; i++) {
- new->bc_ptrs[i] = cur->bc_ptrs[i];
- new->bc_ra[i] = cur->bc_ra[i];
- bp = cur->bc_bufs[i];
+ new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
+ new->bc_levels[i].ra = cur->bc_levels[i].ra;
+ bp = cur->bc_levels[i].bp;
if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_BUF_ADDR(bp), mp->m_bsize,
+ xfs_buf_daddr(bp), mp->m_bsize,
0, &bp,
cur->bc_ops->buf_ops);
if (error) {
@@ -432,7 +504,7 @@ xfs_btree_dup_cursor(
return error;
}
}
- new->bc_bufs[i] = bp;
+ new->bc_levels[i].bp = bp;
}
*ncur = new;
return 0;
@@ -642,6 +714,17 @@ xfs_btree_ptr_addr(
((char *)block + xfs_btree_ptr_offset(cur, n, level));
}
+struct xfs_ifork *
+xfs_btree_ifork_ptr(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ if (cur->bc_flags & XFS_BTREE_STAGING)
+ return cur->bc_ino.ifake->if_fork;
+ return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
+}
+
/*
* Get the root block which is stored in the inode.
*
@@ -652,9 +735,8 @@ STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
struct xfs_btree_cur *cur)
{
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
return (struct xfs_btree_block *)ifp->if_broot;
}
@@ -674,7 +756,7 @@ xfs_btree_get_block(
return xfs_btree_get_iroot(cur);
}
- *bpp = cur->bc_bufs[level];
+ *bpp = cur->bc_levels[level].bp;
return XFS_BUF_TO_BLOCK(*bpp);
}
@@ -684,11 +766,11 @@ xfs_btree_get_block(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
- xfs_buf_t *bp; /* buffer containing block */
+ struct xfs_buf *bp; /* buffer containing block */
/*
* Get the block pointer for this level.
@@ -704,7 +786,7 @@ xfs_btree_firstrec(
/*
* Set the ptr value to 1, that's the first record/key.
*/
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
return 1;
}
@@ -714,11 +796,11 @@ xfs_btree_firstrec(
*/
STATIC int /* success=1, failure=0 */
xfs_btree_lastrec(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level) /* level to change */
{
struct xfs_btree_block *block; /* generic btree block pointer */
- xfs_buf_t *bp; /* buffer containing block */
+ struct xfs_buf *bp; /* buffer containing block */
/*
* Get the block pointer for this level.
@@ -734,7 +816,7 @@ xfs_btree_lastrec(
/*
* Set the ptr value to numrecs, that's the last record/key.
*/
- cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
return 1;
}
@@ -744,20 +826,20 @@ xfs_btree_lastrec(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets, /* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last) /* output: last byte offset */
{
int i; /* current bit number */
- int64_t imask; /* mask for current bit number */
+ uint32_t imask; /* mask for current bit number */
ASSERT(fields != 0);
/*
* Find the lowest bit, so the first byte offset.
*/
- for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
if (imask & fields) {
*first = offsets[i];
break;
@@ -766,7 +848,7 @@ xfs_btree_offsets(
/*
* Find the highest bit, so the last byte offset.
*/
- for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
if (imask & fields) {
*last = offsets[i + 1] - 1;
break;
@@ -881,13 +963,13 @@ xfs_btree_readahead_sblock(
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
left, 1, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
right, 1, cur->bc_ops->buf_ops);
rval++;
}
@@ -915,11 +997,11 @@ xfs_btree_readahead(
(lev == cur->bc_nlevels - 1))
return 0;
- if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
return 0;
- cur->bc_ra[lev] |= lr;
- block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+ cur->bc_levels[lev].ra |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return xfs_btree_readahead_lblock(cur, lr, block);
@@ -928,9 +1010,9 @@ xfs_btree_readahead(
STATIC int
xfs_btree_ptr_to_daddr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- xfs_daddr_t *daddr)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ xfs_daddr_t *daddr)
{
xfs_fsblock_t fsbno;
xfs_agblock_t agbno;
@@ -945,7 +1027,7 @@ xfs_btree_ptr_to_daddr(
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
} else {
agbno = be32_to_cpu(ptr->s);
- *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno);
}
@@ -978,35 +1060,35 @@ xfs_btree_readahead_ptr(
*/
STATIC void
xfs_btree_setbuf(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int lev, /* level in btree */
- xfs_buf_t *bp) /* new buffer to set */
+ struct xfs_buf *bp) /* new buffer to set */
{
struct xfs_btree_block *b; /* btree block */
- if (cur->bc_bufs[lev])
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
- cur->bc_bufs[lev] = bp;
- cur->bc_ra[lev] = 0;
+ if (cur->bc_levels[lev].bp)
+ xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
+ cur->bc_levels[lev].bp = bp;
+ cur->bc_levels[lev].ra = 0;
b = XFS_BUF_TO_BLOCK(bp);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
} else {
if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
- cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
}
}
bool
xfs_btree_ptr_is_null(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return ptr->l == cpu_to_be64(NULLFSBLOCK);
@@ -1014,7 +1096,7 @@ xfs_btree_ptr_is_null(
return ptr->s == cpu_to_be32(NULLAGBLOCK);
}
-STATIC void
+void
xfs_btree_set_ptr_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
@@ -1050,12 +1132,12 @@ xfs_btree_get_sibling(
}
}
-STATIC void
+void
xfs_btree_set_sibling(
- struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- union xfs_btree_ptr *ptr,
- int lr)
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ const union xfs_btree_ptr *ptr,
+ int lr)
{
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
@@ -1083,7 +1165,7 @@ xfs_btree_init_block_int(
__u64 owner,
unsigned int flags)
{
- int crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_has_crc(mp);
__u32 magic = xfs_btree_magic(crc, btnum);
buf->bb_magic = cpu_to_be32(magic);
@@ -1124,11 +1206,11 @@ xfs_btree_init_block(
__u16 numrecs,
__u64 owner)
{
- xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+ xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
btnum, level, numrecs, owner, 0);
}
-STATIC void
+void
xfs_btree_init_block_cur(
struct xfs_btree_cur *cur,
struct xfs_buf *bp,
@@ -1144,13 +1226,13 @@ xfs_btree_init_block_cur(
* code.
*/
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- owner = cur->bc_private.b.ip->i_ino;
+ owner = cur->bc_ino.ip->i_ino;
else
- owner = cur->bc_private.a.agno;
+ owner = cur->bc_ag.pag->pag_agno;
- xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
- cur->bc_btnum, level, numrecs,
- owner, cur->bc_flags);
+ xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
+ xfs_buf_daddr(bp), cur->bc_btnum, level,
+ numrecs, owner, cur->bc_flags);
}
/*
@@ -1185,10 +1267,10 @@ xfs_btree_buf_to_ptr(
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
- XFS_BUF_ADDR(bp)));
+ xfs_buf_daddr(bp)));
else {
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
- XFS_BUF_ADDR(bp)));
+ xfs_buf_daddr(bp)));
}
}
@@ -1220,12 +1302,12 @@ xfs_btree_set_refs(
}
}
-STATIC int
+int
xfs_btree_get_buf_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- struct xfs_btree_block **block,
- struct xfs_buf **bpp)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = cur->bc_mp;
xfs_daddr_t d;
@@ -1250,11 +1332,11 @@ xfs_btree_get_buf_block(
*/
STATIC int
xfs_btree_read_buf_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int flags,
- struct xfs_btree_block **block,
- struct xfs_buf **bpp)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
{
struct xfs_mount *mp = cur->bc_mp;
xfs_daddr_t d;
@@ -1280,12 +1362,12 @@ xfs_btree_read_buf_block(
/*
* Copy keys from one btree block to another.
*/
-STATIC void
+void
xfs_btree_copy_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *dst_key,
- union xfs_btree_key *src_key,
- int numkeys)
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key,
+ int numkeys)
{
ASSERT(numkeys >= 0);
memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
@@ -1308,11 +1390,11 @@ xfs_btree_copy_recs(
/*
* Copy block pointers from one btree block to another.
*/
-STATIC void
+void
xfs_btree_copy_ptrs(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *dst_ptr,
- union xfs_btree_ptr *src_ptr,
+ const union xfs_btree_ptr *src_ptr,
int numptrs)
{
ASSERT(numptrs >= 0);
@@ -1393,8 +1475,8 @@ xfs_btree_log_keys(
xfs_btree_key_offset(cur, first),
xfs_btree_key_offset(cur, last + 1) - 1);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1436,8 +1518,8 @@ xfs_btree_log_ptrs(
xfs_btree_ptr_offset(cur, first, level),
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1449,7 +1531,7 @@ void
xfs_btree_log_block(
struct xfs_btree_cur *cur, /* btree cursor */
struct xfs_buf *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
{
int first; /* first byte offset logged */
int last; /* last byte offset logged */
@@ -1505,8 +1587,8 @@ xfs_btree_log_block(
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1541,7 +1623,7 @@ xfs_btree_increment(
#endif
/* We're done if we remain in the block after the increment. */
- if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
goto out1;
/* Fail if we just went off the right edge of the tree. */
@@ -1564,7 +1646,7 @@ xfs_btree_increment(
goto error0;
#endif
- if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
break;
/* Read-ahead the right block for the next loop. */
@@ -1591,14 +1673,14 @@ xfs_btree_increment(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = 1;
+ cur->bc_levels[lev].ptr = 1;
}
out1:
*stat = 1;
@@ -1623,7 +1705,7 @@ xfs_btree_decrement(
int *stat) /* success/failure */
{
struct xfs_btree_block *block;
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
int error; /* error return value */
int lev;
union xfs_btree_ptr ptr;
@@ -1634,7 +1716,7 @@ xfs_btree_decrement(
xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
/* We're done if we remain in the block after the decrement. */
- if (--cur->bc_ptrs[level] > 0)
+ if (--cur->bc_levels[level].ptr > 0)
goto out1;
/* Get a pointer to the btree block. */
@@ -1658,7 +1740,7 @@ xfs_btree_decrement(
* Stop when we don't go off the left edge of a block.
*/
for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
+ if (--cur->bc_levels[lev].ptr > 0)
break;
/* Read-ahead the left block for the next loop. */
xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
@@ -1684,13 +1766,13 @@ xfs_btree_decrement(
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
union xfs_btree_ptr *ptrp;
- ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
--lev;
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
- cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
}
out1:
*stat = 1;
@@ -1706,10 +1788,10 @@ error0:
int
xfs_btree_lookup_get_block(
- struct xfs_btree_cur *cur, /* btree cursor */
- int level, /* level in the btree */
- union xfs_btree_ptr *pp, /* ptr to btree block */
- struct xfs_btree_block **blkp) /* return btree block */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ const union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
{
struct xfs_buf *bp; /* buffer pointer for btree block */
xfs_daddr_t daddr;
@@ -1728,11 +1810,11 @@ xfs_btree_lookup_get_block(
*
* Otherwise throw it away and get a new one.
*/
- bp = cur->bc_bufs[level];
+ bp = cur->bc_levels[level].bp;
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
if (error)
return error;
- if (bp && XFS_BUF_ADDR(bp) == daddr) {
+ if (bp && xfs_buf_daddr(bp) == daddr) {
*blkp = XFS_BUF_TO_BLOCK(bp);
return 0;
}
@@ -1742,11 +1824,11 @@ xfs_btree_lookup_get_block(
return error;
/* Check the inode owner since the verifiers don't. */
- if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
- !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+ if (xfs_has_crc(cur->bc_mp) &&
+ !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
- cur->bc_private.b.ip->i_ino)
+ cur->bc_ino.ip->i_ino)
goto out_bad;
/* Did we get the level we were looking for? */
@@ -1762,7 +1844,7 @@ xfs_btree_lookup_get_block(
out_bad:
*blkp = NULL;
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
return -EFSCORRUPTED;
}
@@ -1857,7 +1939,7 @@ xfs_btree_lookup(
return -EFSCORRUPTED;
}
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
*stat = 0;
return 0;
}
@@ -1909,7 +1991,7 @@ xfs_btree_lookup(
if (error)
goto error0;
- cur->bc_ptrs[level] = keyno;
+ cur->bc_levels[level].ptr = keyno;
}
}
@@ -1926,7 +2008,7 @@ xfs_btree_lookup(
!xfs_btree_ptr_is_null(cur, &ptr)) {
int i;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
@@ -1937,7 +2019,7 @@ xfs_btree_lookup(
}
} else if (dir == XFS_LOOKUP_LE && diff > 0)
keyno--;
- cur->bc_ptrs[0] = keyno;
+ cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
@@ -2097,7 +2179,7 @@ __xfs_btree_updkeys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
nlkey = xfs_btree_key_addr(cur, ptr, block);
nhkey = xfs_btree_high_key_addr(cur, ptr, block);
if (!force_all &&
@@ -2164,7 +2246,7 @@ xfs_btree_update_keys(
if (error)
return error;
#endif
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
kp = xfs_btree_key_addr(cur, ptr, block);
xfs_btree_copy_keys(cur, kp, &key, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
@@ -2198,7 +2280,7 @@ xfs_btree_update(
goto error0;
#endif
/* Get the address of the rec to be updated. */
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
rp = xfs_btree_rec_addr(cur, ptr, block);
/* Fill in the new contents and log them. */
@@ -2273,7 +2355,7 @@ xfs_btree_lshift(
* If the cursor entry is the one that would be moved, don't
* do it... it's too complicated.
*/
- if (cur->bc_ptrs[level] <= 1)
+ if (cur->bc_levels[level].ptr <= 1)
goto out0;
/* Set up the left neighbor as "left". */
@@ -2407,7 +2489,7 @@ xfs_btree_lshift(
goto error0;
/* Slide the cursor value left one. */
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
*stat = 1;
return 0;
@@ -2469,7 +2551,7 @@ xfs_btree_rshift(
* do it... it's too complicated.
*/
lrecs = xfs_btree_get_numrecs(left);
- if (cur->bc_ptrs[level] >= lrecs)
+ if (cur->bc_levels[level].ptr >= lrecs)
goto out0;
/* Set up the right neighbor as "right". */
@@ -2657,7 +2739,7 @@ __xfs_btree_split(
*/
lrecs = xfs_btree_get_numrecs(left);
rrecs = lrecs / 2;
- if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+ if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
rrecs++;
src_index = (lrecs - rrecs + 1);
@@ -2753,9 +2835,9 @@ __xfs_btree_split(
* If it's just pointing past the last entry in left, then we'll
* insert there, so don't change anything in that case.
*/
- if (cur->bc_ptrs[level] > lrecs + 1) {
+ if (cur->bc_levels[level].ptr > lrecs + 1) {
xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= lrecs;
+ cur->bc_levels[level].ptr -= lrecs;
}
/*
* If there are more levels, we'll need another cursor which refers
@@ -2765,7 +2847,7 @@ __xfs_btree_split(
error = xfs_btree_dup_cursor(cur, curp);
if (error)
goto error0;
- (*curp)->bc_ptrs[level + 1]++;
+ (*curp)->bc_levels[level + 1].ptr++;
}
*ptrp = rptr;
*stat = 1;
@@ -2778,6 +2860,7 @@ error0:
return error;
}
+#ifdef __KERNEL__
struct xfs_btree_split_args {
struct xfs_btree_cur *cur;
int level;
@@ -2801,7 +2884,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
+ unsigned long new_pflags = 0;
/*
* we are in a transaction context here, but may also be doing work
@@ -2810,15 +2893,23 @@ xfs_btree_split_worker(
* in any way.
*/
if (args->kswapd)
- new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ new_pflags |= PF_MEMALLOC | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
args->key, args->curp, args->stat);
- complete(args->done);
+ xfs_trans_clear_context(args->cur->bc_tp);
current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
+ complete(args->done);
+
}
/*
@@ -2855,6 +2946,9 @@ xfs_btree_split(
destroy_work_on_stack(&args.work);
return args.result;
}
+#else
+#define xfs_btree_split __xfs_btree_split
+#endif /* __KERNEL__ */
/*
@@ -2908,16 +3002,18 @@ xfs_btree_new_iroot(
*/
memcpy(cblock, block, xfs_btree_block_len(cur));
if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+ __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+ cblock->bb_u.l.bb_blkno = bno;
else
- cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+ cblock->bb_u.s.bb_blkno = bno;
}
be16_add_cpu(&block->bb_level, 1);
xfs_btree_set_numrecs(block, 1);
cur->bc_nlevels++;
- cur->bc_ptrs[level + 1] = 1;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ cur->bc_levels[level + 1].ptr = 1;
kp = xfs_btree_key_addr(cur, 1, block);
ckp = xfs_btree_key_addr(cur, 1, cblock);
@@ -2938,9 +3034,9 @@ xfs_btree_new_iroot(
xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
- xfs_iroot_realloc(cur->bc_private.b.ip,
+ xfs_iroot_realloc(cur->bc_ino.ip,
1 - xfs_btree_get_numrecs(cblock),
- cur->bc_private.b.whichfork);
+ cur->bc_ino.whichfork);
xfs_btree_setbuf(cur, level, cbp);
@@ -2953,7 +3049,7 @@ xfs_btree_new_iroot(
xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
*logflags |=
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
*stat = 1;
return 0;
error0:
@@ -3078,8 +3174,9 @@ xfs_btree_new_root(
/* Fix up the cursor. */
xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_levels[cur->bc_nlevels].ptr = nptr;
cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
*stat = 1;
return 0;
error0:
@@ -3105,11 +3202,11 @@ xfs_btree_make_block_unfull(
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
level == cur->bc_nlevels - 1) {
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
- xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
*stat = 1;
} else {
/* A root block that needs replacing */
@@ -3136,7 +3233,7 @@ xfs_btree_make_block_unfull(
return error;
if (*stat) {
- *oindex = *index = cur->bc_ptrs[level];
+ *oindex = *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3151,7 +3248,7 @@ xfs_btree_make_block_unfull(
return error;
- *index = cur->bc_ptrs[level];
+ *index = cur->bc_levels[level].ptr;
return 0;
}
@@ -3172,7 +3269,7 @@ xfs_btree_insrec(
struct xfs_btree_block *block; /* btree block */
struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */
- struct xfs_btree_cur *ncur; /* new btree cursor */
+ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey;
int optr; /* old key/record index */
@@ -3198,7 +3295,7 @@ xfs_btree_insrec(
}
/* If we're off the left edge, return failure. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3210,7 +3307,7 @@ xfs_btree_insrec(
/* Get pointers to the btree buffer and block. */
block = xfs_btree_get_block(cur, level, &bp);
- old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
+ old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
numrecs = xfs_btree_get_numrecs(block);
#ifdef DEBUG
@@ -3252,7 +3349,7 @@ xfs_btree_insrec(
#ifdef DEBUG
error = xfs_btree_check_block(cur, block, level, bp);
if (error)
- return error;
+ goto error0;
#endif
/*
@@ -3272,7 +3369,7 @@ xfs_btree_insrec(
for (i = numrecs - ptr; i >= 0; i--) {
error = xfs_btree_debug_check_ptr(cur, pp, i, level);
if (error)
- return error;
+ goto error0;
}
xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
@@ -3326,7 +3423,7 @@ xfs_btree_insrec(
* some records into the new tree block), so use the regular key
* update mechanism.
*/
- if (bp && bp->b_bn != old_bn) {
+ if (bp && xfs_buf_daddr(bp) != old_bn) {
xfs_btree_get_keys(cur, block, lkey);
} else if (xfs_btree_needs_key_update(cur, optr)) {
error = xfs_btree_update_keys(cur, level);
@@ -3357,6 +3454,8 @@ xfs_btree_insrec(
return 0;
error0:
+ if (ncur)
+ xfs_btree_del_cursor(ncur, error);
return error;
}
@@ -3455,9 +3554,9 @@ STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
- int whichfork = cur->bc_private.b.whichfork;
- struct xfs_inode *ip = cur->bc_private.b.ip;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
union xfs_btree_key *kp;
@@ -3514,8 +3613,8 @@ xfs_btree_kill_iroot(
index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
if (index) {
- xfs_iroot_realloc(cur->bc_private.b.ip, index,
- cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(cur->bc_ino.ip, index,
+ cur->bc_ino.whichfork);
block = ifp->if_broot;
}
@@ -3541,10 +3640,10 @@ xfs_btree_kill_iroot(
if (error)
return error;
- cur->bc_bufs[level - 1] = NULL;
+ cur->bc_levels[level - 1].bp = NULL;
be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
cur->bc_nlevels--;
out0:
return 0;
@@ -3574,8 +3673,8 @@ xfs_btree_kill_root(
if (error)
return error;
- cur->bc_bufs[level] = NULL;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = NULL;
+ cur->bc_levels[level].ra = 0;
cur->bc_nlevels--;
return 0;
@@ -3634,7 +3733,7 @@ xfs_btree_delrec(
tcur = NULL;
/* Get the index of the entry being deleted, check for nothing there. */
- ptr = cur->bc_ptrs[level];
+ ptr = cur->bc_levels[level].ptr;
if (ptr == 0) {
*stat = 0;
return 0;
@@ -3712,8 +3811,8 @@ xfs_btree_delrec(
*/
if (level == cur->bc_nlevels - 1) {
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
- xfs_iroot_realloc(cur->bc_private.b.ip, -1,
- cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(cur->bc_ino.ip, -1,
+ cur->bc_ino.whichfork);
error = xfs_btree_kill_iroot(cur);
if (error)
@@ -3944,7 +4043,7 @@ xfs_btree_delrec(
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
tcur = NULL;
if (level == 0)
- cur->bc_ptrs[0]++;
+ cur->bc_levels[0].ptr++;
*stat = 1;
return 0;
@@ -4057,7 +4156,7 @@ xfs_btree_delrec(
* surviving block, and log it.
*/
xfs_btree_set_numrecs(left, lrecs + rrecs);
- xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+ xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
@@ -4081,9 +4180,9 @@ xfs_btree_delrec(
* cursor to the left block, and fix up the index.
*/
if (bp != lbp) {
- cur->bc_bufs[level] = lbp;
- cur->bc_ptrs[level] += lrecs;
- cur->bc_ra[level] = 0;
+ cur->bc_levels[level].bp = lbp;
+ cur->bc_levels[level].ptr += lrecs;
+ cur->bc_levels[level].ra = 0;
}
/*
* If we joined with the right neighbor and there's a level above
@@ -4103,16 +4202,16 @@ xfs_btree_delrec(
* We can't use decrement because it would change the next level up.
*/
if (level > 0)
- cur->bc_ptrs[level]--;
+ cur->bc_levels[level].ptr--;
/*
* We combined blocks, so we have to update the parent keys if the
- * btree supports overlapped intervals. However, bc_ptrs[level + 1]
- * points to the old block so that the caller knows which record to
- * delete. Therefore, the caller must be savvy enough to call updkeys
- * for us if we return stat == 2. The other exit points from this
- * function don't require deletions further up the tree, so they can
- * call updkeys directly.
+ * btree supports overlapped intervals. However,
+ * bc_levels[level + 1].ptr points to the old block so that the caller
+ * knows which record to delete. Therefore, the caller must be savvy
+ * enough to call updkeys for us if we return stat == 2. The other
+ * exit points from this function don't require deletions further up
+ * the tree, so they can call updkeys directly.
*/
/* Return value means the next level up has something to do. */
@@ -4166,7 +4265,7 @@ xfs_btree_delete(
if (i == 0) {
for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
+ if (cur->bc_levels[level].ptr == 0) {
error = xfs_btree_decrement(cur, level, &i);
if (error)
goto error0;
@@ -4197,7 +4296,7 @@ xfs_btree_get_rec(
int error; /* error return value */
#endif
- ptr = cur->bc_ptrs[0];
+ ptr = cur->bc_levels[0].ptr;
block = xfs_btree_get_block(cur, 0, &bp);
#ifdef DEBUG
@@ -4249,6 +4348,21 @@ xfs_btree_visit_block(
if (xfs_btree_ptr_is_null(cur, &rptr))
return -ENOENT;
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking if the right sibling points
+ * back to us and creates a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4403,11 +4517,11 @@ xfs_btree_lblock_v5hdr_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return __this_address;
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
+ if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
if (owner != XFS_RMAP_OWN_UNKNOWN &&
be64_to_cpu(block->bb_u.l.bb_owner) != owner)
@@ -4423,20 +4537,21 @@ xfs_btree_lblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/**
@@ -4453,11 +4568,11 @@ xfs_btree_sblock_v5hdr_verify(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return __this_address;
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
return __this_address;
@@ -4477,40 +4592,94 @@ xfs_btree_sblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_agblock_t agno;
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
}
/*
- * Calculate the number of btree levels needed to store a given number of
- * records in a short-format btree.
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * height of the tree needed to index the number of leaf records.
*/
-uint
+unsigned int
xfs_btree_compute_maxlevels(
- uint *limits,
- unsigned long len)
+ const unsigned int *limits,
+ unsigned long long records)
{
- uint level;
- unsigned long maxblocks;
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned int height = 1;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ height++;
+ }
- maxblocks = (len + limits[0] - 1) / limits[0];
- for (level = 1; maxblocks > 1; level++)
- maxblocks = (maxblocks + limits[1] - 1) / limits[1];
- return level;
+ return height;
+}
+
+/*
+ * For the given limits on leaf and keyptr records per block, calculate the
+ * number of blocks needed to index the given number of leaf records.
+ */
+unsigned long long
+xfs_btree_calc_size(
+ const unsigned int *limits,
+ unsigned long long records)
+{
+ unsigned long long level_blocks = howmany_64(records, limits[0]);
+ unsigned long long blocks = level_blocks;
+
+ while (level_blocks > 1) {
+ level_blocks = howmany_64(level_blocks, limits[1]);
+ blocks += level_blocks;
+ }
+
+ return blocks;
+}
+
+/*
+ * Given a number of available blocks for the btree to consume with records and
+ * pointers, calculate the height of the tree needed to index all the records
+ * that space can hold based on the number of pointers each interior node
+ * holds.
+ *
+ * We start by assuming a single level tree consumes a single block, then track
+ * the number of blocks each node level consumes until we no longer have space
+ * to store the next node level. At this point, we are indexing all the leaf
+ * blocks in the space, and there's no more free space to split the tree any
+ * further. That's our maximum btree height.
+ */
+unsigned int
+xfs_btree_space_to_height(
+ const unsigned int *limits,
+ unsigned long long leaf_blocks)
+{
+ unsigned long long node_blocks = limits[1];
+ unsigned long long blocks_left = leaf_blocks - 1;
+ unsigned int height = 1;
+
+ if (leaf_blocks < 1)
+ return 0;
+
+ while (node_blocks < blocks_left) {
+ blocks_left -= node_blocks;
+ node_blocks *= limits[1];
+ height++;
+ }
+
+ return height;
}
/*
@@ -4521,8 +4690,8 @@ xfs_btree_compute_maxlevels(
STATIC int
xfs_btree_simple_query_range(
struct xfs_btree_cur *cur,
- union xfs_btree_key *low_key,
- union xfs_btree_key *high_key,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
xfs_btree_query_range_fn fn,
void *priv)
{
@@ -4612,8 +4781,8 @@ out:
STATIC int
xfs_btree_overlapped_query_range(
struct xfs_btree_cur *cur,
- union xfs_btree_key *low_key,
- union xfs_btree_key *high_key,
+ const union xfs_btree_key *low_key,
+ const union xfs_btree_key *high_key,
xfs_btree_query_range_fn fn,
void *priv)
{
@@ -4645,23 +4814,25 @@ xfs_btree_overlapped_query_range(
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
pop_up:
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
if (level == 0) {
/* Handle a leaf node. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
@@ -4684,14 +4855,15 @@ pop_up:
/* Record is larger than high key; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* Handle an internal node. */
- lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
- hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
+ hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
+ block);
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
@@ -4714,13 +4886,13 @@ pop_up:
if (error)
goto out;
#endif
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
continue;
} else if (hdiff < 0) {
/* The low key is larger than the upper range; pop. */
goto pop_up;
}
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
}
out:
@@ -4731,13 +4903,14 @@ out:
* with a zero-results range query, so release the buffers if we
* failed to return any results.
*/
- if (cur->bc_bufs[0] == NULL) {
+ if (cur->bc_levels[0].bp == NULL) {
for (i = 0; i < cur->bc_nlevels; i++) {
- if (cur->bc_bufs[i]) {
- xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
- cur->bc_bufs[i] = NULL;
- cur->bc_ptrs[i] = 0;
- cur->bc_ra[i] = 0;
+ if (cur->bc_levels[i].bp) {
+ xfs_trans_brelse(cur->bc_tp,
+ cur->bc_levels[i].bp);
+ cur->bc_levels[i].bp = NULL;
+ cur->bc_levels[i].ptr = 0;
+ cur->bc_levels[i].ra = 0;
}
}
}
@@ -4754,8 +4927,8 @@ out:
int
xfs_btree_query_range(
struct xfs_btree_cur *cur,
- union xfs_btree_irec *low_rec,
- union xfs_btree_irec *high_rec,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn,
void *priv)
{
@@ -4800,29 +4973,6 @@ xfs_btree_query_all(
return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}
-/*
- * Calculate the number of blocks needed to store a given number of records
- * in a short-format (per-AG metadata) btree.
- */
-unsigned long long
-xfs_btree_calc_size(
- uint *limits,
- unsigned long long len)
-{
- int level;
- int maxrecs;
- unsigned long long rval;
-
- maxrecs = limits[0];
- for (level = 0, rval = 0; len > 1; level++) {
- len += maxrecs - 1;
- do_div(len, maxrecs);
- maxrecs = limits[1];
- rval += len;
- }
- return rval;
-}
-
static int
xfs_btree_count_blocks_helper(
struct xfs_btree_cur *cur,
@@ -4862,7 +5012,7 @@ xfs_btree_diff_two_ptrs(
STATIC int
xfs_btree_has_record_helper(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
return -ECANCELED;
@@ -4871,12 +5021,12 @@ xfs_btree_has_record_helper(
/* Is there a record covering a given range of keys? */
int
xfs_btree_has_record(
- struct xfs_btree_cur *cur,
- union xfs_btree_irec *low,
- union xfs_btree_irec *high,
- bool *exists)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high,
+ bool *exists)
{
- int error;
+ int error;
error = xfs_btree_query_range(cur, low, high,
&xfs_btree_has_record_helper, NULL);
@@ -4899,7 +5049,7 @@ xfs_btree_has_more_records(
block = xfs_btree_get_block(cur, 0, &bp);
/* There are still records in this block. */
- if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
+ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
return true;
/* There are more record blocks. */
@@ -4908,3 +5058,42 @@ xfs_btree_has_more_records(
else
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
}
+
+/* Set up all the btree cursor caches. */
+int __init
+xfs_btree_init_cur_caches(void)
+{
+ int error;
+
+ error = xfs_allocbt_init_cur_cache();
+ if (error)
+ return error;
+ error = xfs_inobt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_bmbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_rmapbt_init_cur_cache();
+ if (error)
+ goto err;
+ error = xfs_refcountbt_init_cur_cache();
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xfs_btree_destroy_cur_caches();
+ return error;
+}
+
+/* Destroy all the btree cursor caches, if they've been allocated. */
+void
+xfs_btree_destroy_cur_caches(void)
+{
+ xfs_allocbt_destroy_cur_cache();
+ xfs_inobt_destroy_cur_cache();
+ xfs_bmbt_destroy_cur_cache();
+ xfs_rmapbt_destroy_cur_cache();
+ xfs_refcountbt_destroy_cur_cache();
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3eff7c321d43..eef27858a013 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -10,8 +10,8 @@ struct xfs_buf;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
-
-extern kmem_zone_t *xfs_btree_cur_zone;
+struct xfs_ifork;
+struct xfs_perag;
/*
* Generic key, ptr and record wrapper structures.
@@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
*/
-#define XFS_BB_MAGIC (1 << 0)
-#define XFS_BB_LEVEL (1 << 1)
-#define XFS_BB_NUMRECS (1 << 2)
-#define XFS_BB_LEFTSIB (1 << 3)
-#define XFS_BB_RIGHTSIB (1 << 4)
-#define XFS_BB_BLKNO (1 << 5)
-#define XFS_BB_LSN (1 << 6)
-#define XFS_BB_UUID (1 << 7)
-#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
#define XFS_BB_NUM_BITS_CRC 9
-#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
@@ -90,8 +90,6 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
-#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
-
struct xfs_btree_ops {
/* size of the key and record structures */
size_t key_len;
@@ -104,19 +102,19 @@ struct xfs_btree_ops {
/* update btree root pointer */
void (*set_root)(struct xfs_btree_cur *cur,
- union xfs_btree_ptr *nptr, int level_change);
+ const union xfs_btree_ptr *nptr, int level_change);
/* block allocation / freeing */
int (*alloc_block)(struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start_bno,
+ const union xfs_btree_ptr *start_bno,
union xfs_btree_ptr *new_bno,
int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* update last record information */
void (*update_lastrec)(struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- union xfs_btree_rec *rec,
+ const struct xfs_btree_block *block,
+ const union xfs_btree_rec *rec,
int ptr, int reason);
/* records in block/level */
@@ -128,37 +126,37 @@ struct xfs_btree_ops {
/* init values of btree structures */
void (*init_key_from_rec)(union xfs_btree_key *key,
- union xfs_btree_rec *rec);
+ const union xfs_btree_rec *rec);
void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec);
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
void (*init_high_key_from_rec)(union xfs_btree_key *key,
- union xfs_btree_rec *rec);
+ const union xfs_btree_rec *rec);
/* difference between key value and cursor value */
int64_t (*key_diff)(struct xfs_btree_cur *cur,
- union xfs_btree_key *key);
+ const union xfs_btree_key *key);
/*
* Difference between key2 and key1 -- positive if key1 > key2,
* negative if key1 < key2, and zero if equal.
*/
int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
- union xfs_btree_key *key1,
- union xfs_btree_key *key2);
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2);
const struct xfs_buf_ops *buf_ops;
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2);
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2);
/* check that r1 is lower than r2 */
int (*recs_inorder)(struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2);
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2);
};
/*
@@ -177,54 +175,94 @@ union xfs_btree_irec {
struct xfs_refcount_irec rc;
};
-/* Per-AG btree private information. */
-union xfs_btree_cur_private {
- struct {
- unsigned long nr_ops; /* # record updates */
- int shape_changes; /* # of extent splits */
- } refc;
- struct {
- bool active; /* allocation cursor state */
- } abt;
+/* Per-AG btree information. */
+struct xfs_btree_cur_ag {
+ struct xfs_perag *pag;
+ union {
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ };
+ union {
+ struct {
+ unsigned int nr_ops; /* # record updates */
+ unsigned int shape_changes; /* # of extent splits */
+ } refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
+ };
+};
+
+/* Btree-in-inode cursor information */
+struct xfs_btree_cur_ino {
+ struct xfs_inode *ip;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ int allocated;
+ short forksize;
+ char whichfork;
+ char flags;
+/* We are converting a delalloc reservation */
+#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
+
+/* For extent swap, ignore owner check in verifier */
+#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
+};
+
+struct xfs_btree_level {
+ /* buffer pointer */
+ struct xfs_buf *bp;
+
+ /* key/record number */
+ uint16_t ptr;
+
+ /* readahead info */
+#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */
+ uint16_t ra;
};
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
*/
-typedef struct xfs_btree_cur
+struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
- uint bc_flags; /* btree features - below */
+ struct kmem_cache *bc_cache; /* cursor cache */
+ unsigned int bc_flags; /* btree features - below */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
union xfs_btree_irec bc_rec; /* current insert/search record value */
- struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
- int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
- uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
-#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
-#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
- uint8_t bc_nlevels; /* number of levels in the tree */
- uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
- xfs_btnum_t bc_btnum; /* identifies which btree type */
- int bc_statoff; /* offset of btre stats array */
+ uint8_t bc_nlevels; /* number of levels in the tree */
+ uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ int bc_statoff; /* offset of btree stats array */
+
+ /*
+ * Short btree pointers need an agno to be able to turn the pointers
+ * into physical addresses for IO, so the btree cursor switches between
+ * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
+ * cursor.
+ */
union {
- struct { /* needed for BNO, CNT, INO */
- struct xfs_buf *agbp; /* agf/agi buffer pointer */
- xfs_agnumber_t agno; /* ag number */
- union xfs_btree_cur_private priv;
- } a;
- struct { /* needed for BMAP */
- struct xfs_inode *ip; /* pointer to our inode */
- int allocated; /* count of alloced */
- short forksize; /* fork's inode space */
- char whichfork; /* data or attr fork */
- char flags; /* flags */
-#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
-#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
- } b;
- } bc_private; /* per-btree type data */
-} xfs_btree_cur_t;
+ struct xfs_btree_cur_ag bc_ag;
+ struct xfs_btree_cur_ino bc_ino;
+ };
+
+ /* Must be at the end of the struct! */
+ struct xfs_btree_level bc_levels[];
+};
+
+/*
+ * Compute the size of a btree cursor that can handle a btree of a given
+ * height. The bc_levels array handles node and leaf blocks, so its size
+ * is exactly nlevels.
+ */
+static inline size_t
+xfs_btree_cur_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels);
+}
/* cursor flags */
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
@@ -232,7 +270,12 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
-
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata. The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING (1<<5)
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
@@ -284,7 +327,7 @@ xfs_btree_check_sptr(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int error); /* del because of error */
/*
@@ -293,8 +336,8 @@ xfs_btree_del_cursor(
*/
int /* error */
xfs_btree_dup_cursor(
- xfs_btree_cur_t *cur, /* input cursor */
- xfs_btree_cur_t **ncur);/* output cursor */
+ struct xfs_btree_cur *cur, /* input cursor */
+ struct xfs_btree_cur **ncur);/* output cursor */
/*
* Compute first and last byte offsets for the fields given.
@@ -302,7 +345,7 @@ xfs_btree_dup_cursor(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
@@ -392,13 +435,13 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
*/
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
* Helpers.
*/
-static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block)
{
return be16_to_cpu(block->bb_numrecs);
}
@@ -409,7 +452,7 @@ static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
block->bb_numrecs = cpu_to_be16(numrecs);
}
-static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
{
return be16_to_cpu(block->bb_level);
}
@@ -435,8 +478,12 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
-uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
-unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
+unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
+ unsigned long long records);
+unsigned long long xfs_btree_calc_size(const unsigned int *limits,
+ unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+ unsigned long long blocks);
/*
* Return codes for the query range iterator function are 0 to continue
@@ -446,10 +493,11 @@ unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
* code on its own.
*/
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec, void *priv);
+ const union xfs_btree_rec *rec, void *priv);
int xfs_btree_query_range(struct xfs_btree_cur *cur,
- union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
+ const union xfs_btree_irec *low_rec,
+ const union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn, void *priv);
int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
void *priv);
@@ -477,10 +525,11 @@ union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
- union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
+ const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
-bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr);
+bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr);
int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b);
@@ -491,14 +540,16 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
-int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
- union xfs_btree_irec *high, bool *exists);
+int xfs_btree_has_record(struct xfs_btree_cur *cur,
+ const union xfs_btree_irec *low,
+ const union xfs_btree_irec *high, bool *exists);
bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
/* Does this cursor point to the last block in the given level? */
static inline bool
xfs_btree_islastblock(
- xfs_btree_cur_t *cur,
+ struct xfs_btree_cur *cur,
int level)
{
struct xfs_btree_block *block;
@@ -512,4 +563,44 @@ xfs_btree_islastblock(
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
}
+void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
+ struct xfs_buf **bpp);
+void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
+ int lr);
+void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp, int level, int numrecs);
+void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr, int numptrs);
+void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ const union xfs_btree_key *src_key, int numkeys);
+
+static inline struct xfs_btree_cur *
+xfs_btree_alloc_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_btnum_t btnum,
+ uint8_t maxlevels,
+ struct kmem_cache *cache)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_maxlevels = maxlevels;
+ cur->bc_cache = cache;
+
+ return cur;
+}
+
+int __init xfs_btree_init_cur_caches(void);
+void xfs_btree_destroy_cur_caches(void);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
new file mode 100644
index 000000000000..dd75e208b543
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
+#include "xfs_btree_staging.h"
+
+/*
+ * Staging Cursors and Fake Roots for Btrees
+ * =========================================
+ *
+ * A staging btree cursor is a special type of btree cursor that callers must
+ * use to construct a new btree index using the btree bulk loader code. The
+ * bulk loading code uses the staging btree cursor to abstract the details of
+ * initializing new btree blocks and filling them with records or key/ptr
+ * pairs. Regular btree operations (e.g. queries and modifications) are not
+ * supported with staging cursors, and callers must not invoke them.
+ *
+ * Fake root structures contain all the information about a btree that is under
+ * construction by the bulk loading code. Staging btree cursors point to fake
+ * root structures instead of the usual AG header or inode structure.
+ *
+ * Callers are expected to initialize a fake root structure and pass it into
+ * the _stage_cursor function for a specific btree type. When bulk loading is
+ * complete, callers should call the _commit_staged_btree function for that
+ * specific btree type to commit the new btree into the filesystem.
+ */
+
+/*
+ * Don't allow staging cursors to be duplicated because they're supposed to be
+ * kept private to a single thread.
+ */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(0);
+ return NULL;
+}
+
+/*
+ * Don't allow block allocation for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Don't allow block freeing for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ */
+STATIC int
+xfs_btree_fakeroot_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xbtree_afakeroot *afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ afake = cur->bc_ag.afake;
+ ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/*
+ * Bulk Loading for AG Btrees
+ * ==========================
+ *
+ * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
+ * staging cursor. Callers should initialize this to zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_afakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/* Update the btree root information for a per-AG fake root. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ afake->af_root = be32_to_cpu(ptr->s);
+ afake->af_levels += inc;
+}
+
+/*
+ * Initialize a AG-rooted btree cursor with the given AG btree fake root.
+ * The btree cursor's bc_ops will be overridden as needed to make the staging
+ * functionality work.
+ */
+void
+xfs_btree_stage_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->set_root = xfs_btree_afakeroot_set_root;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ag.afake = afake;
+ cur->bc_nlevels = afake->af_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_afakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.agbp = agbp;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading for Inode-Rooted Btrees
+ * ====================================
+ *
+ * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
+ * the staging cursor. This structure should be initialized as follows:
+ *
+ * - if_fork_size field should be set to the number of bytes available to the
+ * fork in the inode.
+ *
+ * - if_fork should point to a freshly allocated struct xfs_ifork.
+ *
+ * - if_format should be set to the appropriate fork type (e.g.
+ * XFS_DINODE_FMT_BTREE).
+ *
+ * All other fields must be zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_ifakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/*
+ * Initialize an inode-rooted btree cursor with the given inode btree fake
+ * root. The btree cursor's bc_ops will be overridden as needed to make the
+ * staging functionality work. If new_ops is not NULL, these new ops will be
+ * passed out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ino.ifake = ifake;
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+
+ if (new_ops)
+ *new_ops = nops;
+}
+
+/*
+ * Transform an inode-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_ifakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ino.ifake = NULL;
+ cur->bc_ino.whichfork = whichfork;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading of Staged Btrees
+ * =============================
+ *
+ * This interface is used with a staged btree cursor to create a totally new
+ * btree with a large number of records (i.e. more than what would fit in a
+ * single root block). When the creation is complete, the new root can be
+ * linked atomically into the filesystem by committing the staged cursor.
+ *
+ * Creation of a new btree proceeds roughly as follows:
+ *
+ * The first step is to initialize an appropriate fake btree root structure and
+ * then construct a staged btree cursor. Refer to the block comments about
+ * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
+ * more information about how to do this.
+ *
+ * The second step is to initialize a struct xfs_btree_bload context as
+ * documented in the structure definition.
+ *
+ * The third step is to call xfs_btree_bload_compute_geometry to compute the
+ * height of and the number of blocks needed to construct the btree. See the
+ * section "Computing the Geometry of the New Btree" for details about this
+ * computation.
+ *
+ * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
+ * save them for later use by ->claim_block(). Bulk loading requires all
+ * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
+ * rebuild, and to minimize seek distances of the new btree.
+ *
+ * Step five is to call xfs_btree_bload() to start constructing the btree.
+ *
+ * The final step is to commit the staging btree cursor, which logs the new
+ * btree root and turns the staging cursor into a regular cursor. The caller
+ * is responsible for cleaning up the previous btree blocks, if any.
+ *
+ * Computing the Geometry of the New Btree
+ * =======================================
+ *
+ * The number of items placed in each btree block is computed via the following
+ * algorithm: For leaf levels, the number of items for the level is nr_records
+ * in the bload structure. For node levels, the number of items for the level
+ * is the number of blocks in the next lower level of the tree. For each
+ * level, the desired number of items per block is defined as:
+ *
+ * desired = max(minrecs, maxrecs - slack factor)
+ *
+ * The number of blocks for the level is defined to be:
+ *
+ * blocks = floor(nr_items / desired)
+ *
+ * Note this is rounded down so that the npb calculation below will never fall
+ * below minrecs. The number of items that will actually be loaded into each
+ * btree block is defined as:
+ *
+ * npb = nr_items / blocks
+ *
+ * Some of the leftmost blocks in the level will contain one extra record as
+ * needed to handle uneven division. If the number of records in any block
+ * would exceed maxrecs for that level, blocks is incremented and npb is
+ * recalculated.
+ *
+ * In other words, we compute the number of blocks needed to satisfy a given
+ * loading level, then spread the items as evenly as possible.
+ *
+ * The height and number of fs blocks required to create the btree are computed
+ * and returned via btree_height and nr_blocks.
+ */
+
+/*
+ * Put a btree block that we're loading onto the ordered list and release it.
+ * The btree blocks will be written to disk when bulk loading is finished.
+ */
+static void
+xfs_btree_bload_drop_buf(
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
+{
+ if (*bpp == NULL)
+ return;
+
+ if (!xfs_buf_delwri_queue(*bpp, buffers_list))
+ ASSERT(0);
+
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
+}
+
+/*
+ * Allocate and initialize one btree block for bulk loading.
+ *
+ * The new btree block will have its level and numrecs fields set to the values
+ * of the level and nr_this_block parameters, respectively.
+ *
+ * The caller should ensure that ptrp, bpp, and blockp refer to the left
+ * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
+ * will all point to the new block.
+ */
+STATIC int
+xfs_btree_bload_prep_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ unsigned int level,
+ unsigned int nr_this_block,
+ union xfs_btree_ptr *ptrp, /* in/out */
+ struct xfs_buf **bpp, /* in/out */
+ struct xfs_btree_block **blockp, /* in/out */
+ void *priv)
+{
+ union xfs_btree_ptr new_ptr;
+ struct xfs_buf *new_bp;
+ struct xfs_btree_block *new_block;
+ int ret;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ size_t new_size;
+
+ ASSERT(*bpp == NULL);
+
+ /* Allocate a new incore btree root block. */
+ new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot_bytes = (int)new_size;
+
+ /* Initialize it and send it out. */
+ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
+ XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
+ nr_this_block, cur->bc_ino.ip->i_ino,
+ cur->bc_flags);
+
+ *bpp = NULL;
+ *blockp = ifp->if_broot;
+ xfs_btree_set_ptr_null(cur, ptrp);
+ return 0;
+ }
+
+ /* Claim one of the caller's preallocated blocks. */
+ xfs_btree_set_ptr_null(cur, &new_ptr);
+ ret = bbl->claim_block(cur, &new_ptr, priv);
+ if (ret)
+ return ret;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
+ if (ret)
+ return ret;
+
+ /*
+ * The previous block (if any) is the left sibling of the new block,
+ * so set its right sibling pointer to the new block and drop it.
+ */
+ if (*blockp)
+ xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
+ xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ /* Initialize the new btree block. */
+ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
+ xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
+
+ /* Set the out parameters. */
+ *bpp = new_bp;
+ *blockp = new_block;
+ xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
+ return 0;
+}
+
+/* Load one leaf block. */
+STATIC int
+xfs_btree_bload_leaf(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ xfs_btree_bload_get_record_fn get_record,
+ struct xfs_btree_block *block,
+ void *priv)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the leaf block with records. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_rec *block_rec;
+
+ ret = get_record(cur, priv);
+ if (ret)
+ return ret;
+ block_rec = xfs_btree_rec_addr(cur, j, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return 0;
+}
+
+/*
+ * Load one node block with key/ptr pairs.
+ *
+ * child_ptr must point to a block within the next level down in the tree. A
+ * key/ptr entry will be created in the new node block to the block pointed to
+ * by child_ptr. On exit, child_ptr points to the next block on the child
+ * level that needs processing.
+ */
+STATIC int
+xfs_btree_bload_node(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ union xfs_btree_ptr *child_ptr,
+ struct xfs_btree_block *block)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the node block with keys and pointers. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_key child_key;
+ union xfs_btree_ptr *block_ptr;
+ union xfs_btree_key *block_key;
+ struct xfs_btree_block *child_block;
+ struct xfs_buf *child_bp;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ &child_bp);
+ if (ret)
+ return ret;
+
+ block_ptr = xfs_btree_ptr_addr(cur, j, block);
+ xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
+
+ block_key = xfs_btree_key_addr(cur, j, block);
+ xfs_btree_get_keys(cur, child_block, &child_key);
+ xfs_btree_copy_keys(cur, block_key, &child_key, 1);
+
+ xfs_btree_get_sibling(cur, child_block, child_ptr,
+ XFS_BB_RIGHTSIB);
+ xfs_buf_relse(child_bp);
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the maximum number of records (or keyptrs) per block that we want to
+ * install at this level in the btree. Caller is responsible for having set
+ * @cur->bc_ino.forksize to the desired fork size, if appropriate.
+ */
+STATIC unsigned int
+xfs_btree_bload_max_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int ret;
+
+ if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
+ return cur->bc_ops->get_dmaxrecs(cur, level);
+
+ ret = cur->bc_ops->get_maxrecs(cur, level);
+ if (level == 0)
+ ret -= bbl->leaf_slack;
+ else
+ ret -= bbl->node_slack;
+ return ret;
+}
+
+/*
+ * Compute the desired number of records (or keyptrs) per block that we want to
+ * install at this level in the btree, which must be somewhere between minrecs
+ * and max_npb. The caller is free to install fewer records per block.
+ */
+STATIC unsigned int
+xfs_btree_bload_desired_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level);
+
+ /* Root blocks are not subject to minrecs rules. */
+ if (level == cur->bc_nlevels - 1)
+ return max(1U, npb);
+
+ return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
+}
+
+/*
+ * Compute the number of records to be stored in each block at this level and
+ * the number of blocks for this level. For leaf levels, we must populate an
+ * empty root block even if there are no records, so we have to have at least
+ * one block.
+ */
+STATIC void
+xfs_btree_bload_level_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level,
+ uint64_t nr_this_level,
+ unsigned int *avg_per_block,
+ uint64_t *blocks,
+ uint64_t *blocks_with_extra)
+{
+ uint64_t npb;
+ uint64_t dontcare;
+ unsigned int desired_npb;
+ unsigned int maxnr;
+
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
+
+ /*
+ * Compute the number of blocks we need to fill each block with the
+ * desired number of records/keyptrs per block. Because desired_npb
+ * could be minrecs, we use regular integer division (which rounds
+ * the block count down) so that in the next step the effective # of
+ * items per block will never be less than desired_npb.
+ */
+ desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
+ *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
+ *blocks = max(1ULL, *blocks);
+
+ /*
+ * Compute the number of records that we will actually put in each
+ * block, assuming that we want to spread the records evenly between
+ * the blocks. Take care that the effective # of items per block (npb)
+ * won't exceed maxrecs even for the blocks that get an extra record,
+ * since desired_npb could be maxrecs, and in the previous step we
+ * rounded the block count down.
+ */
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
+ (*blocks)++;
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ }
+
+ *avg_per_block = min_t(uint64_t, npb, nr_this_level);
+
+ trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
+ *avg_per_block, desired_npb, *blocks,
+ *blocks_with_extra);
+}
+
+/*
+ * Ensure a slack value is appropriate for the btree.
+ *
+ * If the slack value is negative, set slack so that we fill the block to
+ * halfway between minrecs and maxrecs. Make sure the slack is never so large
+ * that we can underflow minrecs.
+ */
+static void
+xfs_btree_bload_ensure_slack(
+ struct xfs_btree_cur *cur,
+ int *slack,
+ int level)
+{
+ int maxr;
+ int minr;
+
+ maxr = cur->bc_ops->get_maxrecs(cur, level);
+ minr = cur->bc_ops->get_minrecs(cur, level);
+
+ /*
+ * If slack is negative, automatically set slack so that we load the
+ * btree block approximately halfway between minrecs and maxrecs.
+ * Generally, this will net us 75% loading.
+ */
+ if (*slack < 0)
+ *slack = maxr - ((maxr + minr) >> 1);
+
+ *slack = min(*slack, maxr - minr);
+}
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in bbl. Caller must ensure that the btree cursor is a staging
+ * cursor. This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ uint64_t nr_records)
+{
+ uint64_t nr_blocks = 0;
+ uint64_t nr_this_level;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ /*
+ * Make sure that the slack values make sense for traditional leaf and
+ * node blocks. Inode-rooted btrees will return different minrecs and
+ * maxrecs values for the root block (bc_nlevels == level - 1). We're
+ * checking levels 0 and 1 here, so set bc_nlevels such that the btree
+ * code doesn't interpret either as the root level.
+ */
+ cur->bc_nlevels = cur->bc_maxlevels - 1;
+ xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+ xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+ bbl->nr_records = nr_this_level = nr_records;
+ for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
+ uint64_t level_blocks;
+ uint64_t dontcare64;
+ unsigned int level = cur->bc_nlevels - 1;
+ unsigned int avg_per_block;
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &level_blocks, &dontcare64);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * If all the items we want to store at this level
+ * would fit in the inode root block, then we have our
+ * btree root and are done.
+ *
+ * Note that bmap btrees forbid records in the root.
+ */
+ if (level != 0 && nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /*
+ * Otherwise, we have to store all the items for this
+ * level in traditional btree blocks and therefore need
+ * another level of btree to point to those blocks.
+ *
+ * We have to re-compute the geometry for each level of
+ * an inode-rooted btree because the geometry differs
+ * between a btree root in an inode fork and a
+ * traditional btree block.
+ *
+ * This distinction is made in the btree code based on
+ * whether level == bc_nlevels - 1. Based on the
+ * previous root block size check against the root
+ * block geometry, we know that we aren't yet ready to
+ * populate the root. Increment bc_nevels and
+ * recalculate the geometry for a traditional
+ * block-based btree level.
+ */
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ xfs_btree_bload_level_geometry(cur, bbl, level,
+ nr_this_level, &avg_per_block,
+ &level_blocks, &dontcare64);
+ } else {
+ /*
+ * If all the items we want to store at this level
+ * would fit in a single root block, we're done.
+ */
+ if (nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /* Otherwise, we need another level of btree. */
+ cur->bc_nlevels++;
+ ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
+ }
+
+ nr_blocks += level_blocks;
+ nr_this_level = level_blocks;
+ }
+
+ if (cur->bc_nlevels > cur->bc_maxlevels)
+ return -EOVERFLOW;
+
+ bbl->btree_height = cur->bc_nlevels;
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ bbl->nr_blocks = nr_blocks - 1;
+ else
+ bbl->nr_blocks = nr_blocks;
+ return 0;
+}
+
+/* Bulk load a btree given the parameters and geometry established in bbl. */
+int
+xfs_btree_bload(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ void *priv)
+{
+ struct list_head buffers_list;
+ union xfs_btree_ptr child_ptr;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block = NULL;
+ uint64_t nr_this_level = bbl->nr_records;
+ uint64_t blocks;
+ uint64_t i;
+ uint64_t blocks_with_extra;
+ uint64_t total_blocks = 0;
+ unsigned int avg_per_block;
+ unsigned int level = 0;
+ int ret;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ INIT_LIST_HEAD(&buffers_list);
+ cur->bc_nlevels = bbl->btree_height;
+ xfs_btree_set_ptr_null(cur, &child_ptr);
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each leaf block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ /*
+ * Due to rounding, btree blocks will not be evenly populated
+ * in most cases. blocks_with_extra tells us how many blocks
+ * will receive an extra record to distribute the excess across
+ * the current level as evenly as possible.
+ */
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
+ nr_this_block, &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
+ nr_this_block);
+
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ block, priv);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost leaf pointer so we know where to start
+ * with the first node level.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ /* Populate the internal btree nodes. */
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ union xfs_btree_ptr first_ptr;
+
+ nr_this_level = blocks;
+ block = NULL;
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each node block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl,
+ &buffers_list, level, nr_this_block,
+ &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks,
+ &ptr, nr_this_block);
+
+ ret = xfs_btree_bload_node(cur, nr_this_block,
+ &child_ptr, block);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost node pointer so that we know
+ * where to start the next node level above this one.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+ xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
+ }
+
+ /* Initialize the new root. */
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
+ cur->bc_ino.ifake->if_blocks = total_blocks - 1;
+ } else {
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
+ cur->bc_ag.afake->af_levels = cur->bc_nlevels;
+ cur->bc_ag.afake->af_blocks = total_blocks;
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ ret = xfs_buf_delwri_submit(&buffers_list);
+ if (ret)
+ goto out;
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ ret = -EIO;
+ }
+
+out:
+ xfs_buf_delwri_cancel(&buffers_list);
+ if (bp)
+ xfs_buf_relse(bp);
+ return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
new file mode 100644
index 000000000000..f0d2976050ae
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_BTREE_STAGING_H__
+#define __XFS_BTREE_STAGING_H__
+
+/* Fake root for an AG-rooted btree. */
+struct xbtree_afakeroot {
+ /* AG block number of the new btree root. */
+ xfs_agblock_t af_root;
+
+ /* Height of the new btree. */
+ unsigned int af_levels;
+
+ /* Number of blocks used by the btree. */
+ unsigned int af_blocks;
+};
+
+/* Cursor interactions with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+
+/* Fake root for an inode-rooted btree. */
+struct xbtree_ifakeroot {
+ /* Fake inode fork. */
+ struct xfs_ifork *if_fork;
+
+ /* Number of blocks used by the btree. */
+ int64_t if_blocks;
+
+ /* Height of the new btree. */
+ unsigned int if_levels;
+
+ /* Number of bytes available for this fork in the inode. */
+ unsigned int if_fork_size;
+
+ /* Fork format. */
+ unsigned int if_format;
+
+ /* Number of records. */
+ unsigned int if_extents;
+};
+
+/* Cursor interactions with fake roots for inode-rooted btrees. */
+void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ int whichfork, const struct xfs_btree_ops *ops);
+
+/* Bulk loading of staged btrees. */
+typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr, void *priv);
+typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
+ unsigned int nr_this_level, void *priv);
+
+struct xfs_btree_bload {
+ /*
+ * This function will be called nr_records times to load records into
+ * the btree. The function does this by setting the cursor's bc_rec
+ * field in in-core format. Records must be returned in sort order.
+ */
+ xfs_btree_bload_get_record_fn get_record;
+
+ /*
+ * This function will be called nr_blocks times to obtain a pointer
+ * to a new btree block on disk. Callers must preallocate all space
+ * for the new btree before calling xfs_btree_bload, and this function
+ * is what claims that reservation.
+ */
+ xfs_btree_bload_claim_block_fn claim_block;
+
+ /*
+ * This function should return the size of the in-core btree root
+ * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
+ * types.
+ */
+ xfs_btree_bload_iroot_size_fn iroot_size;
+
+ /*
+ * The caller should set this to the number of records that will be
+ * stored in the new btree.
+ */
+ uint64_t nr_records;
+
+ /*
+ * Number of free records to leave in each leaf block. If the caller
+ * sets this to -1, the slack value will be calculated to be halfway
+ * between maxrecs and minrecs. This typically leaves the block 75%
+ * full. Note that slack values are not enforced on inode root blocks.
+ */
+ int leaf_slack;
+
+ /*
+ * Number of free key/ptrs pairs to leave in each node block. This
+ * field has the same semantics as leaf_slack.
+ */
+ int node_slack;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * number of btree blocks needed to store nr_records records.
+ */
+ uint64_t nr_blocks;
+
+ /*
+ * The xfs_btree_bload_compute_geometry function will set this to the
+ * height of the new btree.
+ */
+ unsigned int btree_height;
+};
+
+int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl, uint64_t nr_records);
+int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl,
+ void *priv);
+
+#endif /* __XFS_BTREE_STAGING_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 875e04f82541..e576560b46e9 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
/*
* xfs_da_btree.c
@@ -72,16 +73,22 @@ STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *save_blk);
-kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */
/*
* Allocate a dir-state structure.
* We don't put them on the stack since they're large.
*/
-xfs_da_state_t *
-xfs_da_state_alloc(void)
+struct xfs_da_state *
+xfs_da_state_alloc(
+ struct xfs_da_args *args)
{
- return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+ struct xfs_da_state *state;
+
+ state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ return state;
}
/*
@@ -107,7 +114,18 @@ xfs_da_state_free(xfs_da_state_t *state)
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
- kmem_cache_free(xfs_da_state_zone, state);
+ kmem_cache_free(xfs_da_state_cache, state);
+}
+
+void
+xfs_da_state_reset(
+ struct xfs_da_state *state,
+ struct xfs_da_args *args)
+{
+ xfs_da_state_kill_altpath(state);
+ memset(state, 0, sizeof(struct xfs_da_state));
+ state->args = args;
+ state->mp = state->args->dp->i_mount;
}
static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
@@ -123,7 +141,7 @@ xfs_da3_node_hdr_from_disk(
struct xfs_da3_icnode_hdr *to,
struct xfs_da_intnode *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from;
to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
@@ -150,7 +168,7 @@ xfs_da3_node_hdr_to_disk(
struct xfs_da_intnode *to,
struct xfs_da3_icnode_hdr *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to;
ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
@@ -185,10 +203,10 @@ xfs_da3_blkinfo_verify(
if (!xfs_verify_magic16(bp, hdr->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -247,7 +265,7 @@ xfs_da3_node_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -276,7 +294,7 @@ xfs_da3_node_read_verify(
__this_address);
break;
}
- /* fall through */
+ fallthrough;
case XFS_DA_NODE_MAGIC:
fa = xfs_da3_node_verify(bp);
if (fa)
@@ -436,12 +454,12 @@ xfs_da3_node_create(
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
node = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
- hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
@@ -476,6 +494,9 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
@@ -590,7 +611,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
- xfs_buf_corruption_error(oldblk->bp);
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -603,7 +624,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
- xfs_buf_corruption_error(oldblk->bp);
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -705,7 +726,7 @@ xfs_da3_root_split(
oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
- node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+ node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
}
xfs_trans_log_buf(tp, bp, 0, size - 1);
@@ -858,7 +879,6 @@ xfs_da3_node_rebalance(
{
struct xfs_da_intnode *node1;
struct xfs_da_intnode *node2;
- struct xfs_da_intnode *tmpnode;
struct xfs_da_node_entry *btree1;
struct xfs_da_node_entry *btree2;
struct xfs_da_node_entry *btree_s;
@@ -888,9 +908,7 @@ xfs_da3_node_rebalance(
((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
(be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
- tmpnode = node1;
- node1 = node2;
- node2 = tmpnode;
+ swap(node1, node2);
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
btree1 = nodehdr1.btree;
@@ -1213,7 +1231,7 @@ xfs_da3_root_join(
xfs_trans_buf_copy_type(root_blk->bp, bp);
if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
- da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
}
xfs_trans_log_buf(args->trans, root_blk->bp, 0,
args->geo->blksize - 1);
@@ -1624,7 +1642,7 @@ xfs_da3_node_lookup_int(
}
if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
- xfs_buf_corruption_error(blk->bp);
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
}
@@ -1639,7 +1657,7 @@ xfs_da3_node_lookup_int(
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
- xfs_buf_corruption_error(blk->bp);
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
}
@@ -1647,7 +1665,7 @@ xfs_da3_node_lookup_int(
if (blkno == args->geo->leafblk)
expected_level = nodehdr.level - 1;
else if (expected_level != nodehdr.level) {
- xfs_buf_corruption_error(blk->bp);
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
} else
expected_level--;
@@ -1986,7 +2004,8 @@ xfs_da3_path_shift(
ASSERT(path != NULL);
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
- for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+ for (; level >= 0; level--) {
+ blk = &path->blk[level];
xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
blk->bp->b_addr);
@@ -2138,7 +2157,7 @@ xfs_da_grow_inode_int(
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
int w = args->whichfork;
- xfs_rfsblock_t nblks = dp->i_d.di_nblocks;
+ xfs_rfsblock_t nblks = dp->i_nblocks;
struct xfs_bmbt_irec map, *mapp;
int nmap, error, got, i, mapi;
@@ -2173,8 +2192,8 @@ xfs_da_grow_inode_int(
*/
mapp = kmem_alloc(sizeof(*mapp) * count, 0);
for (b = *bno, mapi = 0; b < *bno + count; ) {
- nmap = min(XFS_BMAP_MAX_NMAP, count);
c = (int)(*bno + count - b);
+ nmap = min(XFS_BMAP_MAX_NMAP, c);
error = xfs_bmapi_write(tp, dp, b, c,
xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
args->total, &mapp[mapi], &nmap);
@@ -2204,7 +2223,7 @@ xfs_da_grow_inode_int(
}
/* account for newly allocated blocks in reserved blocks total */
- args->total -= dp->i_d.di_nblocks - nblks;
+ args->total -= dp->i_nblocks - nblks;
out_free_map:
if (mapp != &map)
@@ -2520,8 +2539,10 @@ xfs_dabuf_map(
*/
if (nirecs > 1) {
map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
- if (!map)
+ if (!map) {
+ error = -ENOMEM;
goto out_free_irecs;
+ }
*mapp = map;
}
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 0f4fbb0889ff..ffa3df5b2893 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
@@ -9,7 +9,6 @@
struct xfs_inode;
struct xfs_trans;
-struct zone;
/*
* Directory/attribute geometry information. There will be one of these for each
@@ -31,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
@@ -57,9 +57,10 @@ typedef struct xfs_da_args {
const uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
uint8_t filetype; /* filetype of inode for directories */
- uint8_t *value; /* set of bytes (maybe contain NULLs) */
+ void *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
- int flags; /* argument flags (eg: ATTR_NOCREATE) */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
xfs_dahash_t hashval; /* hash value of name */
xfs_ino_t inumber; /* input/output inode number */
struct xfs_inode *dp; /* directory inode to manipulate */
@@ -76,29 +77,33 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- int op_flags; /* operation flags */
+ uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
* Operation flags:
*/
-#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
-#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
-#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
-#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
- { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \
- { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \
+ { XFS_DA_OP_LOGGED, "LOGGED" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -199,7 +204,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
* Utility routines.
*/
-#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
@@ -220,14 +225,15 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
-xfs_da_state_t *xfs_da_state_alloc(void);
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
-extern struct kmem_zone *xfs_da_state_zone;
+extern struct kmem_cache *xfs_da_state_cache;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 734837a9b51a..25e2841084e1 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
@@ -15,8 +15,8 @@
*/
#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
typedef struct xfs_da_blkinfo {
__be32 forw; /* previous block in list */
@@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo {
*/
#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
-#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
struct xfs_da3_blkinfo {
/*
@@ -61,7 +61,7 @@ struct xfs_da3_blkinfo {
* Since we have duplicate keys, use a binary search but always follow
* all match in the block, not just the first match found.
*/
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
typedef struct xfs_da_node_hdr {
struct xfs_da_blkinfo info; /* block type, links, etc. */
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
+#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
@@ -579,7 +580,7 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
/*
* Entries are packed toward the top as tight as possible.
*/
-typedef struct xfs_attr_shortform {
+struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
__be16 totsize; /* total bytes in shortform list */
__u8 count; /* count of active entries */
@@ -589,9 +590,9 @@ typedef struct xfs_attr_shortform {
uint8_t namelen; /* actual length of name (no NULL) */
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[1]; /* name & value bytes concatenated */
+ uint8_t nameval[]; /* name & value bytes concatenated */
} list[1]; /* variable sized array */
-} xfs_attr_shortform_t;
+};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
@@ -688,23 +689,11 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
- ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
- ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
/*
* Alignment for namelist and valuelist entries (since they are mixed
@@ -758,14 +747,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
+ nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
+ nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
@@ -801,7 +790,7 @@ struct xfs_attr3_rmt_hdr {
#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
sizeof(struct xfs_attr3_rmt_hdr) : 0))
/* Number of bytes in a directory block. */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 22557527cfdb..5a321b783398 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -16,6 +16,18 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+
+static struct kmem_cache *xfs_defer_pending_cache;
/*
* Deferred Operations in XFS
@@ -176,29 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
[XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
};
/*
+ * Ensure there's a log intent item associated with this deferred work item if
+ * the operation must be restarted on crash. Returns 1 if there's a log item;
+ * 0 if there isn't; or a negative errno.
+ */
+static int
+xfs_defer_create_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp,
+ bool sort)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_log_item *lip;
+
+ if (dfp->dfp_intent)
+ return 1;
+
+ lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ if (!lip)
+ return 0;
+ if (IS_ERR(lip))
+ return PTR_ERR(lip);
+
+ dfp->dfp_intent = lip;
+ return 1;
+}
+
+/*
* For each pending item in the intake list, log its intent item and the
* associated extents, then add the entire intake list to the end of
* the pending list.
+ *
+ * Returns 1 if at least one log item was associated with the deferred work;
+ * 0 if there are no log items; or a negative errno.
*/
-STATIC void
+static int
xfs_defer_create_intents(
struct xfs_trans *tp)
{
- struct list_head *li;
struct xfs_defer_pending *dfp;
- const struct xfs_defer_op_type *ops;
+ int ret = 0;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
+ int ret2;
+
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
- list_for_each(li, &dfp->dfp_work)
- ops->log_item(tp, dfp->dfp_intent, li);
+ ret2 = xfs_defer_create_intent(tp, dfp, true);
+ if (ret2 < 0)
+ return ret2;
+ ret |= ret2;
}
+ return ret;
}
/* Abort all the intents that were committed. */
@@ -223,20 +267,20 @@ xfs_defer_trans_abort(
}
}
-/* Roll a transaction so we can do some deferred op processing. */
-STATIC int
-xfs_defer_trans_roll(
- struct xfs_trans **tpp)
+/*
+ * Capture resources that the caller said not to release ("held") when the
+ * transaction commits. Caller is responsible for zero-initializing @dres.
+ */
+static int
+xfs_defer_save_resources(
+ struct xfs_defer_resources *dres,
+ struct xfs_trans *tp)
{
- struct xfs_trans *tp = *tpp;
struct xfs_buf_log_item *bli;
struct xfs_inode_log_item *ili;
struct xfs_log_item *lip;
- struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS];
- struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES];
- int bpcount = 0, ipcount = 0;
- int i;
- int error;
+
+ BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
list_for_each_entry(lip, &tp->t_items, li_trans) {
switch (lip->li_type) {
@@ -244,25 +288,29 @@ xfs_defer_trans_roll(
bli = container_of(lip, struct xfs_buf_log_item,
bli_item);
if (bli->bli_flags & XFS_BLI_HOLD) {
- if (bpcount >= XFS_DEFER_OPS_NR_BUFS) {
+ if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
ASSERT(0);
return -EFSCORRUPTED;
}
- xfs_trans_dirty_buf(tp, bli->bli_buf);
- bplist[bpcount++] = bli->bli_buf;
+ if (bli->bli_flags & XFS_BLI_ORDERED)
+ dres->dr_ordered |=
+ (1U << dres->dr_bufs);
+ else
+ xfs_trans_dirty_buf(tp, bli->bli_buf);
+ dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
}
break;
case XFS_LI_INODE:
ili = container_of(lip, struct xfs_inode_log_item,
ili_item);
if (ili->ili_lock_flags == 0) {
- if (ipcount >= XFS_DEFER_OPS_NR_INODES) {
+ if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
ASSERT(0);
return -EFSCORRUPTED;
}
xfs_trans_log_inode(tp, ili->ili_inode,
XFS_ILOG_CORE);
- iplist[ipcount++] = ili->ili_inode;
+ dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
}
break;
default:
@@ -270,7 +318,43 @@ xfs_defer_trans_roll(
}
}
- trace_xfs_defer_trans_roll(tp, _RET_IP_);
+ return 0;
+}
+
+/* Attach the held resources to the transaction. */
+static void
+xfs_defer_restore_resources(
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
+
+ /* Rejoin the joined inodes. */
+ for (i = 0; i < dres->dr_inos; i++)
+ xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
+
+ /* Rejoin the buffers and dirty them so the log moves forward. */
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_trans_bjoin(tp, dres->dr_bp[i]);
+ if (dres->dr_ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
+ xfs_trans_bhold(tp, dres->dr_bp[i]);
+ }
+}
+
+/* Roll a transaction so we can do some deferred op processing. */
+STATIC int
+xfs_defer_trans_roll(
+ struct xfs_trans **tpp)
+{
+ struct xfs_defer_resources dres = { };
+ int error;
+
+ error = xfs_defer_save_resources(&dres, *tpp);
+ if (error)
+ return error;
+
+ trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
/*
* Roll the transaction. Rolling always given a new transaction (even
@@ -280,40 +364,15 @@ xfs_defer_trans_roll(
* happened.
*/
error = xfs_trans_roll(tpp);
- tp = *tpp;
- /* Rejoin the joined inodes. */
- for (i = 0; i < ipcount; i++)
- xfs_trans_ijoin(tp, iplist[i], 0);
-
- /* Rejoin the buffers and dirty them so the log moves forward. */
- for (i = 0; i < bpcount; i++) {
- xfs_trans_bjoin(tp, bplist[i]);
- xfs_trans_bhold(tp, bplist[i]);
- }
+ xfs_defer_restore_resources(*tpp, &dres);
if (error)
- trace_xfs_defer_trans_roll_error(tp, error);
+ trace_xfs_defer_trans_roll_error(*tpp, error);
return error;
}
/*
- * Reset an already used dfops after finish.
- */
-static void
-xfs_defer_reset(
- struct xfs_trans *tp)
-{
- ASSERT(list_empty(&tp->t_dfops));
-
- /*
- * Low mode state transfers across transaction rolls to mirror dfops
- * lifetime. Clear it now that dfops is reset.
- */
- tp->t_flags &= ~XFS_TRANS_LOWMODE;
-}
-
-/*
* Free up any items left in the list.
*/
static void
@@ -341,8 +400,112 @@ xfs_defer_cancel_list(
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
- kmem_free(dfp);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+ }
+}
+
+/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item; and then log a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
+ * Log an intent-done item for the first pending intent, and finish the work
+ * items.
+ */
+static int
+xfs_defer_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_btree_cur *state = NULL;
+ struct list_head *li, *n;
+ int error;
+
+ trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
+
+ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ list_del(li);
+ dfp->dfp_count--;
+ error = ops->finish_item(tp, dfp->dfp_done, li, &state);
+ if (error == -EAGAIN) {
+ int ret;
+
+ /*
+ * Caller wants a fresh transaction; put the work item
+ * back on the list and log a new log intent item to
+ * replace the old one. See "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" above.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
+ ret = xfs_defer_create_intent(tp, dfp, false);
+ if (ret < 0)
+ error = ret;
+ }
+
+ if (error)
+ goto out;
}
+
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+out:
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(tp, state, error);
+ return error;
}
/*
@@ -357,12 +520,8 @@ int
xfs_defer_finish_noroll(
struct xfs_trans **tp)
{
- struct xfs_defer_pending *dfp;
- struct list_head *li;
- struct list_head *n;
- void *state;
+ struct xfs_defer_pending *dfp = NULL;
int error = 0;
- const struct xfs_defer_op_type *ops;
LIST_HEAD(dop_pending);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -371,87 +530,51 @@ xfs_defer_finish_noroll(
/* Until we run out of pending work to finish... */
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
- /* log intents and pull in intake items */
- xfs_defer_create_intents(*tp);
- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
-
/*
- * Roll the transaction.
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
*/
- error = xfs_defer_trans_roll(tp);
- if (error)
- goto out;
+ int has_intents = xfs_defer_create_intents(*tp);
- /* Log an intent-done item for the first pending item. */
- dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
- dfp_list);
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
- dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
- dfp->dfp_count);
-
- /* Finish the work items. */
- state = NULL;
- list_for_each_safe(li, n, &dfp->dfp_work) {
- list_del(li);
- dfp->dfp_count--;
- error = ops->finish_item(*tp, li, dfp->dfp_done,
- &state);
- if (error == -EAGAIN) {
- /*
- * Caller wants a fresh transaction;
- * put the work item back on the list
- * and jump out.
- */
- list_add(li, &dfp->dfp_work);
- dfp->dfp_count++;
- break;
- } else if (error) {
- /*
- * Clean up after ourselves and jump out.
- * xfs_defer_cancel will take care of freeing
- * all these lists and stuff.
- */
- if (ops->finish_cleanup)
- ops->finish_cleanup(*tp, state, error);
- goto out;
- }
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+
+ if (has_intents < 0) {
+ error = has_intents;
+ goto out_shutdown;
}
- if (error == -EAGAIN) {
- /*
- * Caller wants a fresh transaction, so log a
- * new log intent item to replace the old one
- * and roll the transaction. See "Requesting
- * a Fresh Transaction while Finishing
- * Deferred Work" above.
- */
- dfp->dfp_intent = ops->create_intent(*tp,
- dfp->dfp_count);
- dfp->dfp_done = NULL;
- list_for_each(li, &dfp->dfp_work)
- ops->log_item(*tp, dfp->dfp_intent, li);
- } else {
- /* Done with the dfp, free it. */
- list_del(&dfp->dfp_list);
- kmem_free(dfp);
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
}
- if (ops->finish_cleanup)
- ops->finish_cleanup(*tp, state, error);
- }
-
-out:
- if (error) {
- xfs_defer_trans_abort(*tp, &dop_pending);
- xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
- trace_xfs_defer_finish_error(*tp, error);
- xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
- xfs_defer_cancel(*tp);
- return error;
+ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
+ dfp_list);
+ error = xfs_defer_finish_one(*tp, dfp);
+ if (error && error != -EAGAIN)
+ goto out_shutdown;
}
trace_xfs_defer_finish_done(*tp, _RET_IP_);
return 0;
+
+out_shutdown:
+ xfs_defer_trans_abort(*tp, &dop_pending);
+ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ trace_xfs_defer_finish_error(*tp, error);
+ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+ xfs_defer_cancel(*tp);
+ return error;
}
int
@@ -475,7 +598,10 @@ xfs_defer_finish(
return error;
}
}
- xfs_defer_reset(*tp);
+
+ /* Reset LOWMODE now that we've finished all the dfops. */
+ ASSERT(list_empty(&(*tp)->t_dfops));
+ (*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -516,8 +642,8 @@ xfs_defer_add(
dfp = NULL;
}
if (!dfp) {
- dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
- KM_NOFS);
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
@@ -549,6 +675,256 @@ xfs_defer_move(
* that behavior.
*/
dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
+ stp->t_flags &= ~XFS_TRANS_LOWMODE;
+}
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_capture *dfc;
+ unsigned short i;
+ int error;
+
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ error = xfs_defer_create_intents(tp);
+ if (error < 0)
+ return ERR_PTR(error);
+
+ /* Create an object to capture the defer ops. */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ error = xfs_defer_save_resources(&dfc->dfc_held, tp);
+ if (error) {
+ /*
+ * Resource capture should never fail, but if it does, we
+ * still have to shut down the log and release things
+ * properly.
+ */
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ }
+
+ /*
+ * Grab extra references to the inodes and buffers because callers are
+ * expected to release their held references after we commit the
+ * transaction.
+ */
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
+ ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+ ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
+ }
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
+
+ return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
+void
+xfs_defer_ops_capture_free(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ unsigned short i;
+
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
+
+ for (i = 0; i < dfc->dfc_held.dr_inos; i++)
+ xfs_irele(dfc->dfc_held.dr_ip[i]);
+
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct list_head *capture_list)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp);
+ if (IS_ERR(dfc)) {
+ xfs_trans_cancel(tp);
+ return PTR_ERR(dfc);
+ }
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ xfs_defer_ops_capture_free(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_defer_resources *dres)
+{
+ unsigned int i;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock the captured resources to the new transaction. */
+ if (dfc->dfc_held.dr_inos == 2)
+ xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
+ dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
+ else if (dfc->dfc_held.dr_inos == 1)
+ xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
+ xfs_defer_restore_resources(tp, &dfc->dfc_held);
+ memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+
+ kmem_free(dfc);
+}
+
+/* Release the resources captured and continued during recovery. */
+void
+xfs_defer_resources_rele(
+ struct xfs_defer_resources *dres)
+{
+ unsigned short i;
- xfs_defer_reset(stp);
+ for (i = 0; i < dres->dr_inos; i++) {
+ xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
+ xfs_irele(dres->dr_ip[i]);
+ dres->dr_ip[i] = NULL;
+ }
+
+ for (i = 0; i < dres->dr_bufs; i++) {
+ xfs_buf_relse(dres->dr_bp[i]);
+ dres->dr_bp[i] = NULL;
+ }
+
+ dres->dr_inos = 0;
+ dres->dr_bufs = 0;
+ dres->dr_ordered = 0;
+}
+
+static inline int __init
+xfs_defer_init_cache(void)
+{
+ xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
+ sizeof(struct xfs_defer_pending),
+ 0, 0, NULL);
+
+ return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
+}
+
+static inline void
+xfs_defer_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_defer_pending_cache);
+ xfs_defer_pending_cache = NULL;
+}
+
+/* Set up caches for deferred work items. */
+int __init
+xfs_defer_init_item_caches(void)
+{
+ int error;
+
+ error = xfs_defer_init_cache();
+ if (error)
+ return error;
+ error = xfs_rmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_refcount_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_bmap_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_extfree_intent_init_cache();
+ if (error)
+ goto err;
+ error = xfs_attr_intent_init_cache();
+ if (error)
+ goto err;
+ return 0;
+err:
+ xfs_defer_destroy_item_caches();
+ return error;
+}
+
+/* Destroy all the deferred work item caches, if they've been allocated. */
+void
+xfs_defer_destroy_item_caches(void)
+{
+ xfs_attr_intent_destroy_cache();
+ xfs_extfree_intent_destroy_cache();
+ xfs_bmap_intent_destroy_cache();
+ xfs_refcount_intent_destroy_cache();
+ xfs_rmap_intent_destroy_cache();
+ xfs_defer_destroy_cache();
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7c28d7608ac6..114a3a4930a3 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
@@ -6,7 +6,9 @@
#ifndef __XFS_DEFER_H__
#define __XFS_DEFER_H__
+struct xfs_btree_cur;
struct xfs_defer_op_type;
+struct xfs_defer_capture;
/*
* Header for deferred operation list.
@@ -17,6 +19,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
XFS_DEFER_OPS_TYPE_MAX,
};
@@ -28,8 +31,8 @@ enum xfs_defer_ops_type {
struct xfs_defer_pending {
struct list_head dfp_list; /* pending items */
struct list_head dfp_work; /* work items */
- void *dfp_intent; /* log intent item */
- void *dfp_done; /* log done item */
+ struct xfs_log_item *dfp_intent; /* log intent item */
+ struct xfs_log_item *dfp_done; /* log done item */
unsigned int dfp_count; /* # extent items */
enum xfs_defer_ops_type dfp_type;
};
@@ -43,15 +46,16 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
- void (*abort_intent)(void *);
- void *(*create_done)(struct xfs_trans *, void *, unsigned int);
- int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
- void **);
- void (*finish_cleanup)(struct xfs_trans *, void *, int);
- void (*cancel_item)(struct list_head *);
- int (*diff_items)(void *, struct list_head *, struct list_head *);
- void *(*create_intent)(struct xfs_trans *, uint);
- void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+ struct list_head *items, unsigned int count, bool sort);
+ void (*abort_intent)(struct xfs_log_item *intent);
+ struct xfs_log_item *(*create_done)(struct xfs_trans *tp,
+ struct xfs_log_item *intent, unsigned int count);
+ int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done,
+ struct list_head *item, struct xfs_btree_cur **state);
+ void (*finish_cleanup)(struct xfs_trans *tp,
+ struct xfs_btree_cur *state, int error);
+ void (*cancel_item)(struct list_head *item);
unsigned int max_items;
};
@@ -60,5 +64,68 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
+
+/*
+ * Deferred operation item relogging limits.
+ */
+#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
+#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
+
+/* Resources that must be held across a transaction roll. */
+struct xfs_defer_resources {
+ /* held buffers */
+ struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
+
+ /* inodes with no unlock flags */
+ struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
+
+ /* number of held buffers */
+ unsigned short dr_bufs;
+
+ /* bitmap of ordered buffers */
+ unsigned short dr_ordered;
+
+ /* number of held inodes */
+ unsigned short dr_inos;
+};
+
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ struct xfs_defer_resources dfc_held;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_defer_resources *dres);
+void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *d);
+void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+
+int __init xfs_defer_init_item_caches(void);
+void xfs_defer_destroy_item_caches(void);
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index dd6fcaaea318..92bac3373f1f 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -19,7 +19,11 @@
#include "xfs_error.h"
#include "xfs_trace.h"
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+const struct xfs_name xfs_name_dotdot = {
+ .name = (const unsigned char *)"..",
+ .len = 2,
+ .type = XFS_DIR3_FT_DIR,
+};
/*
* Convert inode mode to directory entry filetype
@@ -54,10 +58,10 @@ xfs_mode_to_ftype(
*/
xfs_dahash_t
xfs_ascii_ci_hashname(
- struct xfs_name *name)
+ const struct xfs_name *name)
{
- xfs_dahash_t hash;
- int i;
+ xfs_dahash_t hash;
+ int i;
for (i = 0, hash = 0; i < name->len; i++)
hash = tolower(name->name[i]) ^ rol32(hash, 7);
@@ -115,7 +119,7 @@ xfs_da_mount(
dageo->fsblog = mp->m_sb.sb_blocklog;
dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb);
dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr);
dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr);
@@ -146,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;
/* set up attribute geometry - single fsb only */
@@ -157,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
@@ -179,9 +191,9 @@ xfs_dir_isempty(
xfs_dir2_sf_hdr_t *sfp;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
- if (dp->i_d.di_size == 0) /* might happen during shutdown. */
+ if (dp->i_disk_size == 0) /* might happen during shutdown. */
return 1;
- if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+ if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
return 0;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
return !sfp->count;
@@ -243,13 +255,13 @@ int
xfs_dir_createname(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name,
+ const struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -278,7 +290,7 @@ xfs_dir_createname(
if (!inum)
args->op_flags |= XFS_DA_OP_JUSTCHECK;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_addname(args);
goto out_free;
}
@@ -337,16 +349,16 @@ xfs_dir_cilookup_result(
int
xfs_dir_lookup(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_ino_t *inum, /* out: inode number */
- struct xfs_name *ci_name) /* out: actual name if CI match */
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
{
- struct xfs_da_args *args;
- int rval;
- int v; /* type-checking value */
- int lock_mode;
+ struct xfs_da_args *args;
+ int rval;
+ bool v;
+ int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
@@ -373,7 +385,7 @@ xfs_dir_lookup(
args->op_flags |= XFS_DA_OP_CILOOKUP;
lock_mode = xfs_ilock_data_map_shared(dp);
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_lookup(args);
goto out_check_rval;
}
@@ -423,7 +435,7 @@ xfs_dir_removename(
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
@@ -443,7 +455,7 @@ xfs_dir_removename(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_removename(args);
goto out_free;
}
@@ -475,13 +487,13 @@ int
xfs_dir_replace(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name, /* name of entry to replace */
+ const struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_extlen_t total) /* bmap's total block count */
{
struct xfs_da_args *args;
int rval;
- int v; /* type-checking value */
+ bool v;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
@@ -504,7 +516,7 @@ xfs_dir_replace(
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_replace(args);
goto out_free;
}
@@ -584,8 +596,8 @@ xfs_dir2_grow_inode(
xfs_fsize_t size; /* directory file (data) size */
size = XFS_FSB_TO_B(mp, bno + count);
- if (size > dp->i_d.di_size) {
- dp->i_d.di_size = size;
+ if (size > dp->i_disk_size) {
+ dp->i_disk_size = size;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
}
}
@@ -598,19 +610,23 @@ xfs_dir2_grow_inode(
int
xfs_dir2_isblock(
struct xfs_da_args *args,
- int *vp) /* out: 1 is block, 0 is not block */
+ bool *isblock)
{
- xfs_fileoff_t last; /* last file offset */
- int rval;
+ struct xfs_mount *mp = args->dp->i_mount;
+ xfs_fileoff_t eof;
+ int error;
- if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
- return rval;
- rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- if (XFS_IS_CORRUPT(args->dp->i_mount,
- rval != 0 &&
- args->dp->i_d.di_size != args->geo->blksize))
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isblock = false;
+ if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize)
+ return 0;
+
+ *isblock = true;
+ if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
return -EFSCORRUPTED;
- *vp = rval;
return 0;
}
@@ -620,14 +636,20 @@ xfs_dir2_isblock(
int
xfs_dir2_isleaf(
struct xfs_da_args *args,
- int *vp) /* out: 1 is block, 0 is not block */
+ bool *isleaf)
{
- xfs_fileoff_t last; /* last file offset */
- int rval;
+ xfs_fileoff_t eof;
+ int error;
- if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
- return rval;
- *vp = last == args->geo->leafblk + args->geo->fsbcount;
+ error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ *isleaf = false;
+ if (eof != args->geo->leafblk + args->geo->fsbcount)
+ return 0;
+
+ *isleaf = true;
return 0;
}
@@ -687,7 +709,7 @@ xfs_dir2_shrink_inode(
/*
* If the block isn't the last one in the directory, we're done.
*/
- if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+ if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
return 0;
bno = da;
if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
@@ -703,7 +725,7 @@ xfs_dir2_shrink_inode(
/*
* Set the size to the new last block.
*/
- dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+ dp->i_disk_size = XFS_FSB_TO_B(mp, bno);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
@@ -728,9 +750,9 @@ xfs_dir2_namecheck(
xfs_dahash_t
xfs_dir2_hashname(
struct xfs_mount *mp,
- struct xfs_name *name)
+ const struct xfs_name *name)
{
- if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
+ if (unlikely(xfs_has_asciici(mp)))
return xfs_ascii_ci_hashname(name);
return xfs_da_hashname(name->name, name->len);
}
@@ -741,7 +763,7 @@ xfs_dir2_compname(
const unsigned char *name,
int len)
{
- if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb)))
+ if (unlikely(xfs_has_asciici(args->dp->i_mount)))
return xfs_ascii_ci_compname(args, name, len);
return xfs_da_compname(args, name, len);
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 033777e282f2..dd39f17dd9a9 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -21,7 +21,7 @@ struct xfs_dir2_data_unused;
struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
-extern struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dotdot;
/*
* Convert inode mode to directory entry filetype
@@ -39,18 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t *inum,
+ const struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
-extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp,
- xfs_ino_t inum);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
@@ -63,8 +61,8 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
-extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isblock(struct xfs_da_args *args, bool *isblock);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, bool *isleaf);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index d6ced59b9567..00f960a703b2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,10 +53,10 @@ xfs_dir3_block_verify(
if (!xfs_verify_magic(bp, hdr3->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -71,7 +71,7 @@ xfs_dir3_block_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -96,7 +96,7 @@ xfs_dir3_block_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.verify_struct = xfs_dir3_block_verify,
};
+static xfs_failaddr_t
+xfs_dir3_block_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_dir3_block_read(
struct xfs_trans *tp,
@@ -121,12 +138,24 @@ xfs_dir3_block_read(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_block_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
@@ -142,10 +171,10 @@ xfs_dir3_block_init(
bp->b_ops = &xfs_dir3_block_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
- hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
@@ -813,7 +842,7 @@ xfs_dir2_block_removename(
* See if the size as a shortform is good enough.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return 0;
/*
@@ -932,7 +961,7 @@ xfs_dir2_leaf_to_block(
* been left behind during no-space-reservation operations.
* These will show up in the leaf bests table.
*/
- while (dp->i_d.di_size > args->geo->blksize) {
+ while (dp->i_disk_size > args->geo->blksize) {
int hdrsz;
hdrsz = args->geo->data_entry_offset;
@@ -1026,7 +1055,7 @@ xfs_dir2_leaf_to_block(
* Now see if the resulting block can be shrunken to shortform.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return 0;
return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
@@ -1042,7 +1071,7 @@ xfs_dir2_sf_to_block(
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
struct xfs_da_geometry *geo = args->geo;
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
xfs_dir2_data_hdr_t *hdr; /* block header */
@@ -1067,15 +1096,15 @@ xfs_dir2_sf_to_block(
trace_xfs_dir2_sf_to_block(args);
- ASSERT(ifp->if_flags & XFS_IFINLINE);
- ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
- ASSERT(ifp->if_bytes == dp->i_d.di_size);
+ ASSERT(ifp->if_bytes == dp->i_disk_size);
ASSERT(ifp->if_u1.if_data != NULL);
- ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
- ASSERT(dp->i_d.di_nextents == 0);
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+ ASSERT(dp->i_df.if_nextents == 0);
/*
* Copy the directory into a temporary buffer.
@@ -1086,7 +1115,7 @@ xfs_dir2_sf_to_block(
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
- dp->i_d.di_size = 0;
+ dp->i_disk_size = 0;
/*
* Add block 0 to the inode.
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index b9eba8213180..dbcf58979a59 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -29,7 +29,7 @@ xfs_dir2_data_bestfree_p(
struct xfs_mount *mp,
struct xfs_dir2_data_hdr *hdr)
{
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
return hdr->bestfree;
}
@@ -51,7 +51,7 @@ xfs_dir2_data_get_ftype(
struct xfs_mount *mp,
struct xfs_dir2_data_entry *dep)
{
- if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (xfs_has_ftype(mp)) {
uint8_t ftype = dep->name[dep->namelen];
if (likely(ftype < XFS_DIR3_FT_MAX))
@@ -70,7 +70,7 @@ xfs_dir2_data_put_ftype(
ASSERT(ftype < XFS_DIR3_FT_MAX);
ASSERT(dep->namelen != 0);
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
dep->name[dep->namelen] = ftype;
}
@@ -218,7 +218,7 @@ __xfs_dir3_data_check(
*/
if (dep->namelen == 0)
return __this_address;
- if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)))
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
return __this_address;
if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
return __this_address;
@@ -297,10 +297,10 @@ xfs_dir3_data_verify(
if (!xfs_verify_magic(bp, hdr3->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -343,7 +343,7 @@ xfs_dir3_data_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -368,7 +368,7 @@ xfs_dir3_data_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -394,6 +394,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.verify_write = xfs_dir3_data_write_verify,
};
+static xfs_failaddr_t
+xfs_dir3_data_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_has_crc(mp)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
int
xfs_dir3_data_read(
@@ -403,12 +419,24 @@ xfs_dir3_data_read(
unsigned int flags,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK,
&xfs_dir3_data_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_data_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
}
@@ -689,12 +717,12 @@ xfs_dir3_data_init(
* Initialize the header.
*/
hdr = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
memset(hdr3, 0, sizeof(*hdr3));
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
- hdr3->blkno = cpu_to_be64(bp->b_bn);
+ hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a131b520aac7..cb9e950a911d 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -37,7 +37,7 @@ xfs_dir2_leaf_hdr_from_disk(
struct xfs_dir3_icleaf_hdr *to,
struct xfs_dir2_leaf *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from;
to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
@@ -68,7 +68,7 @@ xfs_dir2_leaf_hdr_to_disk(
struct xfs_dir2_leaf *to,
struct xfs_dir3_icleaf_hdr *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to;
ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
@@ -108,12 +108,12 @@ xfs_dir3_leaf1_check(
if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
return __this_address;
} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
return __this_address;
- return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false);
}
static inline void
@@ -139,29 +139,34 @@ xfs_failaddr_t
xfs_dir3_leaf_check_int(
struct xfs_mount *mp,
struct xfs_dir3_icleaf_hdr *hdr,
- struct xfs_dir2_leaf *leaf)
+ struct xfs_dir2_leaf *leaf,
+ bool expensive_checking)
{
struct xfs_da_geometry *geo = mp->m_dir_geo;
xfs_dir2_leaf_tail_t *ltp;
int stale;
int i;
+ bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+ hdr->magic == XFS_DIR3_LEAF1_MAGIC);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
/*
* XXX (dgc): This value is not restrictive enough.
* Should factor in the size of the bests table as well.
- * We can deduce a value for that from di_size.
+ * We can deduce a value for that from i_disk_size.
*/
if (hdr->count > geo->leaf_max_ents)
return __this_address;
/* Leaves and bests don't overlap in leaf format. */
- if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
- hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+ if (isleaf1 &&
(char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
return __this_address;
+ if (!expensive_checking)
+ return NULL;
+
/* Check hash value order, count stale entries. */
for (i = stale = 0; i < hdr->count; i++) {
if (i + 1 < hdr->count) {
@@ -171,6 +176,10 @@ xfs_dir3_leaf_check_int(
}
if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
+ if (isleaf1 && xfs_dir2_dataptr_to_db(geo,
+ be32_to_cpu(hdr->ents[i].address)) >=
+ be32_to_cpu(ltp->bestcount))
+ return __this_address;
}
if (hdr->stale != stale)
return __this_address;
@@ -195,7 +204,7 @@ xfs_dir3_leaf_verify(
return fa;
xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr);
- return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr);
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr, true);
}
static void
@@ -205,7 +214,7 @@ xfs_dir3_leaf_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -230,7 +239,7 @@ xfs_dir3_leaf_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -304,7 +313,7 @@ xfs_dir3_leaf_init(
ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
memset(leaf3, 0, sizeof(*leaf3));
@@ -312,7 +321,7 @@ xfs_dir3_leaf_init(
leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
- leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+ leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
leaf3->info.owner = cpu_to_be64(owner);
uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
@@ -1383,7 +1392,7 @@ xfs_dir2_leaf_removename(
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[db]) != oldbest) {
- xfs_buf_corruption_error(lbp);
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
}
/*
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index a0cc5e240306..7a03aeb9f4c9 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -68,12 +68,12 @@ xfs_dir3_leafn_check(
if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+ if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp))
return __this_address;
} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
return __this_address;
- return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf, false);
}
static inline void
@@ -105,12 +105,12 @@ xfs_dir3_free_verify(
if (!xfs_verify_magic(bp, hdr->magic))
return __this_address;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp))
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
@@ -128,7 +128,7 @@ xfs_dir3_free_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -153,7 +153,7 @@ xfs_dir3_free_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
@@ -185,7 +185,7 @@ xfs_dir3_free_header_check(
firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
maxbests;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
if (be32_to_cpu(hdr3->firstdb) != firstdb)
@@ -194,6 +194,8 @@ xfs_dir3_free_header_check(
return __this_address;
if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
return __this_address;
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
} else {
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
@@ -226,8 +228,9 @@ __xfs_dir3_free_read(
/* Check things that we can't do in the verifier. */
fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
if (fa) {
- xfs_verifier_error(*bpp, -EFSCORRUPTED, fa);
+ __xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
return -EFSCORRUPTED;
}
@@ -244,7 +247,7 @@ xfs_dir2_free_hdr_from_disk(
struct xfs_dir3_icfree_hdr *to,
struct xfs_dir2_free *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from;
to->magic = be32_to_cpu(from3->hdr.hdr.magic);
@@ -271,7 +274,7 @@ xfs_dir2_free_hdr_to_disk(
struct xfs_dir2_free *to,
struct xfs_dir3_icfree_hdr *from)
{
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to;
ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
@@ -338,12 +341,12 @@ xfs_dir3_free_get_buf(
memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
memset(&hdr, 0, sizeof(hdr));
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
hdr.magic = XFS_DIR3_FREE_MAGIC;
- hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+ hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp));
hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
@@ -438,8 +441,8 @@ xfs_dir2_leaf_to_node(
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
if (be32_to_cpu(ltp->bestcount) >
- (uint)dp->i_d.di_size / args->geo->blksize) {
- xfs_buf_corruption_error(lbp);
+ (uint)dp->i_disk_size / args->geo->blksize) {
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
}
@@ -513,7 +516,7 @@ xfs_dir2_leafn_add(
* into other peoples memory
*/
if (index < 0) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -800,7 +803,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir3_leaf_check(dp, bp);
if (leafhdr.count <= 0) {
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
}
@@ -2012,9 +2015,7 @@ xfs_dir2_node_addname(
/*
* Allocate and initialize the state (btree cursor).
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
/*
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
@@ -2083,9 +2084,8 @@ xfs_dir2_node_lookup(
/*
* Allocate and initialize the btree cursor.
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
+
/*
* Fill in the path to the entry in the cursor.
*/
@@ -2136,9 +2136,7 @@ xfs_dir2_node_removename(
/*
* Allocate and initialize the btree cursor.
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
/* Look up the entry we're deleting, set up the cursor. */
error = xfs_da3_node_lookup_int(state, &rval);
@@ -2203,9 +2201,7 @@ xfs_dir2_node_replace(
/*
* Allocate and initialize the btree cursor.
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
/*
* We have to save new inode number and ftype since
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 01ee0b926572..7404a9ff1a92 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr {
};
/* xfs_dir2.c */
-xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name);
+xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name);
enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -127,7 +127,8 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp,
- struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf,
+ bool expensive_checks);
/* xfs_dir2_node.c */
void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp,
@@ -195,12 +196,13 @@ xfs_dir2_data_entsize(
len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
sizeof(xfs_dir2_data_off_t) /* tag */;
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
len += sizeof(uint8_t);
return round_up(len, XFS_DIR2_DATA_ALIGN);
}
-xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name);
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp,
+ const struct xfs_name *name);
enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 7b7f6fb2ea3b..8cd37e6e9d38 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -48,7 +48,7 @@ xfs_dir2_sf_entsize(
count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
count += sizeof(uint8_t);
return count;
}
@@ -76,7 +76,7 @@ xfs_dir2_sf_get_ino(
{
uint8_t *from = sfep->name + sfep->namelen;
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
from++;
if (!hdr->i8count)
@@ -95,7 +95,7 @@ xfs_dir2_sf_put_ino(
ASSERT(ino <= XFS_MAXINUMBER);
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
to++;
if (hdr->i8count)
@@ -135,7 +135,7 @@ xfs_dir2_sf_get_ftype(
struct xfs_mount *mp,
struct xfs_dir2_sf_entry *sfep)
{
- if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (xfs_has_ftype(mp)) {
uint8_t ftype = sfep->name[sfep->namelen];
if (ftype < XFS_DIR3_FT_MAX)
@@ -153,7 +153,7 @@ xfs_dir2_sf_put_ftype(
{
ASSERT(ftype < XFS_DIR3_FT_MAX);
- if (xfs_sb_version_hasftype(&mp->m_sb))
+ if (xfs_has_ftype(mp))
sfep->name[sfep->namelen] = ftype;
}
@@ -192,7 +192,7 @@ xfs_dir2_block_sfsize(
* if there is a filetype field, add the extra byte to the namelen
* for each entry that we see.
*/
- has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+ has_ftype = xfs_has_ftype(mp) ? 1 : 0;
count = i8count = namelen = 0;
btp = xfs_dir2_block_tail_p(geo, hdr);
@@ -237,7 +237,7 @@ xfs_dir2_block_sfsize(
(i8count ? /* inumber */
count * XFS_INO64_SIZE :
count * XFS_INO32_SIZE);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return size; /* size value is a failure */
}
/*
@@ -343,8 +343,8 @@ xfs_dir2_block_to_sf(
*/
ASSERT(dp->i_df.if_bytes == 0);
xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size);
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- dp->i_d.di_size = size;
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+ dp->i_disk_size = size;
logflags |= XFS_ILOG_DDATA;
xfs_dir2_sf_check(args);
@@ -367,7 +367,7 @@ xfs_dir2_sf_addname(
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int incr_isize; /* total change in size */
- int new_isize; /* di_size after adding name */
+ int new_isize; /* size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
int pick; /* which algorithm to use */
@@ -378,12 +378,12 @@ xfs_dir2_sf_addname(
ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
dp = args->dp;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Compute entry (and change in) size.
*/
@@ -401,12 +401,12 @@ xfs_dir2_sf_addname(
objchange = 1;
}
- new_isize = (int)dp->i_d.di_size + incr_isize;
+ new_isize = (int)dp->i_disk_size + incr_isize;
/*
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
*/
- if (new_isize > XFS_IFORK_DSIZE(dp) ||
+ if (new_isize > xfs_inode_data_fork_size(dp) ||
(pick =
xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
/*
@@ -492,7 +492,7 @@ xfs_dir2_sf_addname_easy(
sfp->count++;
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
sfp->i8count++;
- dp->i_d.di_size = new_isize;
+ dp->i_disk_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -519,7 +519,7 @@ xfs_dir2_sf_addname_hard(
int nbytes; /* temp for byte copies */
xfs_dir2_data_aoff_t new_offset; /* next offset value */
xfs_dir2_data_aoff_t offset; /* current offset value */
- int old_isize; /* previous di_size */
+ int old_isize; /* previous size */
xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
@@ -529,7 +529,7 @@ xfs_dir2_sf_addname_hard(
* Copy the old directory to the stack buffer.
*/
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- old_isize = (int)dp->i_d.di_size;
+ old_isize = (int)dp->i_disk_size;
buf = kmem_alloc(old_isize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
memcpy(oldsfp, sfp, old_isize);
@@ -586,7 +586,7 @@ xfs_dir2_sf_addname_hard(
memcpy(sfep, oldsfep, old_isize - nbytes);
}
kmem_free(buf);
- dp->i_d.di_size = new_isize;
+ dp->i_disk_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -697,7 +697,7 @@ xfs_dir2_sf_check(
ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX);
}
ASSERT(i8count == sfp->i8count);
- ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+ ASSERT((char *)sfep - (char *)sfp == dp->i_disk_size);
ASSERT(offset +
(sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
(uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
@@ -710,11 +710,11 @@ xfs_dir2_sf_verify(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
char *endp;
- struct xfs_ifork *ifp;
xfs_ino_t ino;
int i;
int i8count;
@@ -723,9 +723,8 @@ xfs_dir2_sf_verify(
int error;
uint8_t filetype;
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
size = ifp->if_bytes;
@@ -822,18 +821,16 @@ xfs_dir2_sf_create(
dp = args->dp;
ASSERT(dp != NULL);
- ASSERT(dp->i_d.di_size == 0);
+ ASSERT(dp->i_disk_size == 0);
/*
* If it's currently a zero-length extent file,
* convert it to local format.
*/
- if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
- dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) {
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
- dp->i_df.if_flags |= XFS_IFINLINE;
}
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_df.if_bytes == 0);
i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
size = xfs_dir2_sf_hdr_size(i8count);
@@ -851,7 +848,7 @@ xfs_dir2_sf_create(
*/
xfs_dir2_sf_put_parent_ino(sfp, pino);
sfp->count = 0;
- dp->i_d.di_size = size;
+ dp->i_disk_size = size;
xfs_dir2_sf_check(args);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
return 0;
@@ -868,7 +865,6 @@ xfs_dir2_sf_lookup(
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
- int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
enum xfs_dacmp cmp; /* comparison result */
@@ -878,12 +874,12 @@ xfs_dir2_sf_lookup(
xfs_dir2_sf_check(args);
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Special case for .
*/
@@ -932,8 +928,7 @@ xfs_dir2_sf_lookup(
if (!ci_sfep)
return -ENOENT;
/* otherwise process the CI match as required by the caller */
- error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
- return error;
+ return xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
}
/*
@@ -955,8 +950,8 @@ xfs_dir2_sf_removename(
trace_xfs_dir2_sf_removename(args);
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- oldsize = (int)dp->i_d.di_size;
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ oldsize = (int)dp->i_disk_size;
ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == oldsize);
ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -996,7 +991,7 @@ xfs_dir2_sf_removename(
* Fix up the header and file size.
*/
sfp->count--;
- dp->i_d.di_size = newsize;
+ dp->i_disk_size = newsize;
/*
* Reallocate, making it smaller.
*/
@@ -1019,7 +1014,7 @@ xfs_dir2_sf_removename(
/*
* Check whether the sf dir replace operation need more blocks.
*/
-bool
+static bool
xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
@@ -1027,14 +1022,14 @@ xfs_dir2_sf_replace_needblock(
int newsize;
struct xfs_dir2_sf_hdr *sfp;
- if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
return false;
sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
return inum > XFS_DIR2_MAX_SHORT_INUM &&
- sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+ sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp);
}
/*
@@ -1054,12 +1049,12 @@ xfs_dir2_sf_replace(
trace_xfs_dir2_sf_replace(args);
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+ ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* New inode number is large, and need to convert to 8-byte inodes.
@@ -1220,7 +1215,7 @@ xfs_dir2_sf_toino4(
* Clean up the inode.
*/
kmem_free(buf);
- dp->i_d.di_size = newsize;
+ dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
@@ -1293,6 +1288,6 @@ xfs_dir2_sf_toino8(
* Clean up the inode.
*/
kmem_free(buf);
- dp->i_d.di_size = newsize;
+ dp->i_disk_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index bedc1e752b60..15a362e2f5ea 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -22,7 +22,7 @@ xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
ASSERT(nbblks > 0);
- return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
+ return BBTOB(nbblks) / sizeof(struct xfs_dqblk);
}
/*
@@ -37,9 +37,10 @@ xfs_failaddr_t
xfs_dquot_verify(
struct xfs_mount *mp,
struct xfs_disk_dquot *ddq,
- xfs_dqid_t id,
- uint type) /* used only during quotacheck */
+ xfs_dqid_t id) /* used only during quotacheck */
{
+ __u8 ddq_type;
+
/*
* We can encounter an uninitialized dquot buffer for 2 reasons:
* 1. If we crash while deleting the quotainode(s), and those blks got
@@ -60,11 +61,19 @@ xfs_dquot_verify(
if (ddq->d_version != XFS_DQUOT_VERSION)
return __this_address;
- if (type && ddq->d_flags != type)
+ if (ddq->d_type & ~XFS_DQTYPE_ANY)
return __this_address;
- if (ddq->d_flags != XFS_DQ_USER &&
- ddq->d_flags != XFS_DQ_PROJ &&
- ddq->d_flags != XFS_DQ_GROUP)
+ ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;
+ if (ddq_type != XFS_DQTYPE_USER &&
+ ddq_type != XFS_DQTYPE_PROJ &&
+ ddq_type != XFS_DQTYPE_GROUP)
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
+ !xfs_has_bigtime(mp))
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
return __this_address;
if (id != -1 && id != be32_to_cpu(ddq->d_id))
@@ -95,14 +104,13 @@ xfs_failaddr_t
xfs_dqblk_verify(
struct xfs_mount *mp,
struct xfs_dqblk *dqb,
- xfs_dqid_t id,
- uint type) /* used only during quotacheck */
+ xfs_dqid_t id) /* used only during quotacheck */
{
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type);
+ return xfs_dquot_verify(mp, &dqb->dd_diskdq, id);
}
/*
@@ -113,20 +121,20 @@ xfs_dqblk_repair(
struct xfs_mount *mp,
struct xfs_dqblk *dqb,
xfs_dqid_t id,
- uint type)
+ xfs_dqtype_t type)
{
/*
* Typically, a repair is only requested by quotacheck.
*/
ASSERT(id != -1);
- memset(dqb, 0, sizeof(xfs_dqblk_t));
+ memset(dqb, 0, sizeof(struct xfs_dqblk));
dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- dqb->dd_diskdq.d_flags = type;
+ dqb->dd_diskdq.d_type = type;
dqb->dd_diskdq.d_id = cpu_to_be32(id);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -143,7 +151,7 @@ xfs_dquot_buf_verify_crc(
int ndquots;
int i;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return true;
/*
@@ -205,7 +213,7 @@ xfs_dquot_buf_verify(
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
+ fa = xfs_dqblk_verify(mp, &dqb[i], id + i);
if (fa) {
if (!readahead)
xfs_buf_verifier_error(bp, -EFSCORRUPTED,
@@ -287,3 +295,31 @@ const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+
+/* Convert an on-disk timer value into an incore timer value. */
+time64_t
+xfs_dquot_from_disk_ts(
+ struct xfs_disk_dquot *ddq,
+ __be32 dtimer)
+{
+ uint32_t t = be32_to_cpu(dtimer);
+
+ if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME))
+ return xfs_dq_bigtime_to_unix(t);
+
+ return t;
+}
+
+/* Convert an incore timer value into an on-disk timer value. */
+__be32
+xfs_dquot_to_disk_ts(
+ struct xfs_dquot *dqp,
+ time64_t timer)
+{
+ uint32_t t = timer;
+
+ if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME))
+ t = xfs_dq_unix_to_bigtime(timer);
+
+ return cpu_to_be32(t);
+}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 79e6c4fb1d8a..5362908164b0 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
* Copyright (C) 2017 Oracle.
@@ -55,7 +55,14 @@
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
#define XFS_ERRTAG_IUNLINK_FALLBACK 34
-#define XFS_ERRTAG_MAX 35
+#define XFS_ERRTAG_BUF_IOERROR 35
+#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
+#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
+#define XFS_ERRTAG_AG_RESV_FAIL 38
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -95,5 +102,12 @@
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
+#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
+#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 77e9fa385980..371dc07233e0 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -9,7 +9,7 @@
/*
* XFS On Disk Format Definitions
*
- * This header file defines all the on-disk format definitions for
+ * This header file defines all the on-disk format definitions for
* general XFS objects. Directory and attribute related objects are defined in
* xfs_da_format.h, which log and log item formats are defined in
* xfs_log_format.h. Everything else goes here.
@@ -184,7 +184,7 @@ typedef struct xfs_sb {
* Superblock - on disk version. Must match the in core version above.
* Must be padded to 64 bit alignment.
*/
-typedef struct xfs_dsb {
+struct xfs_dsb {
__be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
__be32 sb_blocksize; /* logical block size, bytes */
__be64 sb_dblocks; /* number of data blocks */
@@ -263,8 +263,7 @@ typedef struct xfs_dsb {
uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
-} xfs_dsb_t;
-
+};
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
@@ -280,37 +279,9 @@ typedef struct xfs_dsb {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-/*
- * The first XFS version we support is a v4 superblock with V2 directories.
- */
-static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
-{
- if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
- return false;
- if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
- return false;
-
- /* check for unknown features in the fs */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
- return true;
-}
-
-static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
-{
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
- return true;
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
- return xfs_sb_good_v4_features(sbp);
- return false;
-}
-
-static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp)
+static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
{
- return sbp->sb_rblocks > 0;
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
/*
@@ -322,9 +293,10 @@ static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
{
- return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
+ return xfs_sb_is_v5(sbp) ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
}
static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
@@ -332,87 +304,18 @@ static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
}
-static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
-}
-
static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
}
-static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
-}
-
-static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
-}
-
-static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
-}
-
-static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
-}
-
-static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
-{
- return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
-}
-
-static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
-}
-
-/*
- * sb_features2 bit version macros.
- */
-static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
-}
-
-static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
-}
-
static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
}
-static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
-{
- sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
- if (!sbp->sb_features2)
- sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
-}
-
-static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
-}
-
-static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
+static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
@@ -449,10 +352,12 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
- XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ XFS_SB_FEAT_RO_COMPAT_REFLINK| \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -465,10 +370,16 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
+#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
- XFS_SB_FEAT_INCOMPAT_META_UUID)
+ XFS_SB_FEAT_INCOMPAT_META_UUID| \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME| \
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -479,7 +390,9 @@ xfs_sb_has_incompat_feature(
return (sbp->sb_features_incompat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
@@ -489,67 +402,27 @@ xfs_sb_has_incompat_log_feature(
return (sbp->sb_features_log_incompat & feature) != 0;
}
-/*
- * V5 superblock specific feature checks
- */
-static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
-}
-
-static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
- (xfs_sb_version_hasmorebits(sbp) &&
- (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
-}
-
-static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
-}
-
-static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
-}
-
-/*
- * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID
- * is stored separately from the user-visible UUID; this allows the
- * user-visible UUID to be changed on V5 filesystems which have a
- * filesystem UUID stamped into every piece of metadata.
- */
-static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
+static inline void
+xfs_sb_remove_incompat_log_features(
+ struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
+ sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL;
}
-static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
+static inline void
+xfs_sb_add_incompat_log_features(
+ struct xfs_sb *sbp,
+ unsigned int features)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
+ sbp->sb_features_log_incompat |= features;
}
-static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
}
-/*
- * end of superblock version macros
- */
-
static inline bool
xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
{
@@ -560,7 +433,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
@@ -588,7 +460,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
#define XFS_B_TO_FSB(mp,b) \
((((uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
#define XFS_B_TO_FSBT(mp,b) (((uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
/*
* Allocation group header
@@ -663,26 +534,26 @@ typedef struct xfs_agf {
#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
-#define XFS_AGF_MAGICNUM 0x00000001
-#define XFS_AGF_VERSIONNUM 0x00000002
-#define XFS_AGF_SEQNO 0x00000004
-#define XFS_AGF_LENGTH 0x00000008
-#define XFS_AGF_ROOTS 0x00000010
-#define XFS_AGF_LEVELS 0x00000020
-#define XFS_AGF_FLFIRST 0x00000040
-#define XFS_AGF_FLLAST 0x00000080
-#define XFS_AGF_FLCOUNT 0x00000100
-#define XFS_AGF_FREEBLKS 0x00000200
-#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
-#define XFS_AGF_REFCOUNT_ROOT 0x00008000
-#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
-#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
#define XFS_AGF_NUM_BITS 18
-#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
{ XFS_AGF_MAGICNUM, "MAGICNUM" }, \
@@ -707,7 +578,6 @@ typedef struct xfs_agf {
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
/*
* Size of the unlinked inode hash table in the agi.
@@ -750,32 +620,35 @@ typedef struct xfs_agi {
__be32 agi_free_root; /* root of the free inode btree */
__be32 agi_free_level;/* levels in free inode btree */
+ __be32 agi_iblocks; /* inobt blocks used */
+ __be32 agi_fblocks; /* finobt blocks used */
+
/* structure must be padded to 64 bit alignment */
} xfs_agi_t;
#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-#define XFS_AGI_MAGICNUM (1 << 0)
-#define XFS_AGI_VERSIONNUM (1 << 1)
-#define XFS_AGI_SEQNO (1 << 2)
-#define XFS_AGI_LENGTH (1 << 3)
-#define XFS_AGI_COUNT (1 << 4)
-#define XFS_AGI_ROOT (1 << 5)
-#define XFS_AGI_LEVEL (1 << 6)
-#define XFS_AGI_FREECOUNT (1 << 7)
-#define XFS_AGI_NEWINO (1 << 8)
-#define XFS_AGI_DIRINO (1 << 9)
-#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
-#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define XFS_AGI_FREE_ROOT (1 << 11)
-#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_NUM_BITS_R2 13
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
/*
* The third a.g. block contains the a.g. freelist, an array
@@ -783,21 +656,15 @@ typedef struct xfs_agi {
*/
#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
-
-#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
- &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
- (__be32 *)(bp)->b_addr)
+#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr))
-typedef struct xfs_agfl {
+struct xfs_agfl {
__be32 agfl_magicnum;
__be32 agfl_seqno;
uuid_t agfl_uuid;
__be64 agfl_lsn;
__be32 agfl_crc;
- __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */
-} __attribute__((packed)) xfs_agfl_t;
+} __attribute__((packed));
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
@@ -823,10 +690,87 @@ typedef struct xfs_agfl {
ASSERT(xfs_daddr_to_agno(mp, d) == \
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
-typedef struct xfs_timestamp {
+/*
+ * XFS Timestamps
+ * ==============
+ *
+ * Traditional ondisk inode timestamps consist of signed 32-bit counters for
+ * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC
+ * 1970, which means that the timestamp epoch is the same as the Unix epoch.
+ * Therefore, the ondisk min and max defined here can be used directly to
+ * constrain the incore timestamps on a Unix system. Note that we actually
+ * encode a __be64 value on disk.
+ *
+ * When the bigtime feature is enabled, ondisk inode timestamps become an
+ * unsigned 64-bit nanoseconds counter. This means that the bigtime inode
+ * timestamp epoch is the start of the classic timestamp range, which is
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * /must/ use the bigtime conversion functions when encoding and decoding raw
+ * timestamps.
+ */
+typedef __be64 xfs_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_timestamp {
__be32 t_sec; /* timestamp seconds */
__be32 t_nsec; /* timestamp nanoseconds */
-} xfs_timestamp_t;
+};
+
+/*
+ * Smallest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN)
+
+/*
+ * Largest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038.
+ */
+#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX)
+
+/*
+ * Smallest possible ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with the traditional
+ * minimum timestamp of Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_BIGTIME_TIME_MIN ((int64_t)0)
+
+/*
+ * Largest supported ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with an incore timestamp
+ * of Jul 2 20:20:24 UTC 2486.
+ *
+ * We round down the ondisk limit so that the bigtime quota and inode max
+ * timestamps will be the same.
+ */
+#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL))
+
+/*
+ * Bigtime epoch is set exactly to the minimum time value that a traditional
+ * 32-bit timestamp can represent when using the Unix epoch as a reference.
+ * Hence the Unix epoch is at a fixed offset into the supported bigtime
+ * timestamp range.
+ *
+ * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS
+ * timestamp can represent so we will not lose any fidelity in converting
+ * to/from unix and bigtime timestamps.
+ *
+ * The following conversion factor converts a seconds counter from the Unix
+ * epoch to the bigtime epoch.
+ */
+#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN)
+
+/* Convert a timestamp from the Unix epoch to the bigtime epoch. */
+static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds)
+{
+ return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/* Convert a timestamp from the bigtime epoch to the Unix epoch. */
+static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
+}
/*
* On-disk inode structure.
@@ -838,15 +782,14 @@ typedef struct xfs_timestamp {
* attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
* below.
*
- * There is a very similar struct icdinode in xfs_inode which matches the
- * layout of the first 96 bytes of this structure, but is kept in native
- * format instead of big endian.
+ * There is a very similar struct xfs_log_dinode which matches the layout of
+ * this structure, but is kept in native format instead of big endian.
*
* Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
* padding field for v3 inodes.
*/
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-typedef struct xfs_dinode {
+struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
@@ -857,16 +800,41 @@ typedef struct xfs_dinode {
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
- __u8 di_pad[6]; /* unused, zeroed space */
- __be16 di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
- __be32 di_nextents; /* number of extents in data fork */
- __be16 di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
@@ -891,7 +859,7 @@ typedef struct xfs_dinode {
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
-} xfs_dinode_t;
+};
#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
@@ -936,6 +904,56 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_UUID, "uuid" }
/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
+ * with 1KB sized blocks, a file can reach upto,
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
+/*
* Inode minimum and maximum sizes.
*/
#define XFS_DINODE_MIN_LOG 8
@@ -946,23 +964,22 @@ enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp, version) \
- ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+#define XFS_DINODE_SIZE(mp) \
+ (xfs_has_v3inodes(mp) ? \
+ sizeof(struct xfs_dinode) : \
+ offsetof(struct xfs_dinode, di_crc))
+#define XFS_LITINO(mp) \
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp))
/*
* Inode data & attribute fork sizes, per inode.
*/
-#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp, (dip)->di_version))
+ ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp))
#define XFS_DFORK_ASIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
- 0)
+ ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
XFS_DFORK_DSIZE(dip, mp) : \
@@ -985,10 +1002,6 @@ enum xfs_dinode_fmt {
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_nextents) : \
- be16_to_cpu((dip)->di_anextents))
/*
* For block and character special files the 32bit dev_t is stored at the
@@ -1054,12 +1067,31 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
+#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_ANY \
- (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+
+static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
+}
+
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
/*
* Inode number format:
@@ -1142,16 +1174,111 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
+
+/* bitmask to determine if this is a user/group/project dquot */
+#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
+ XFS_DQTYPE_PROJ | \
+ XFS_DQTYPE_GROUP)
+
+#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \
+ XFS_DQTYPE_BIGTIME)
+
+/*
+ * XFS Quota Timers
+ * ================
+ *
+ * Traditional quota grace period expiration timers are an unsigned 32-bit
+ * seconds counter; time zero is the Unix epoch, Jan 1 00:00:01 UTC 1970.
+ * Note that an expiration value of zero means that the quota limit has not
+ * been reached, and therefore no expiration has been set. Therefore, the
+ * ondisk min and max defined here can be used directly to constrain the incore
+ * quota expiration timestamps on a Unix system.
+ *
+ * When bigtime is enabled, we trade two bits of precision to expand the
+ * expiration timeout range to match that of big inode timestamps. The min and
+ * max recorded here are the on-disk limits, not a Unix timestamp.
+ *
+ * The grace period for each quota type is stored in the root dquot (id = 0)
+ * and is applied to a non-root dquot when it exceeds the soft or hard limits.
+ * The length of quota grace periods are unsigned 32-bit quantities measured in
+ * units of seconds. A value of zero means to use the default period.
+ */
+
+/*
+ * Smallest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1)
+
+/*
+ * Largest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX)
+
+/*
+ * Smallest possible ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with the incore
+ * expiration of Jan 1 00:00:04 UTC 1970.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN)
+
+/*
+ * Largest supported ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with an incore
+ * expiration of Jul 2 20:20:24 UTC 2486.
+ *
+ * The ondisk field supports values up to -1U, which corresponds to an incore
+ * expiration in 2514. This is beyond the maximum the bigtime inode timestamp,
+ * so we cap the maximum bigtime quota expiration to the max inode timestamp.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U)
+
/*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the struct xfs_dquot that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
+ * The following conversion factors assist in converting a quota expiration
+ * timestamp between the incore and ondisk formats.
+ */
+#define XFS_DQ_BIGTIME_SHIFT (2)
+#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1)
+
+/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */
+static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds)
+{
+ /*
+ * Round the expiration timestamp up to the nearest bigtime timestamp
+ * that we can store, to give users the most time to fix problems.
+ */
+ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >>
+ XFS_DQ_BIGTIME_SHIFT;
+}
+
+/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */
+static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT;
+}
+
+/*
+ * Default quota grace periods, ranging from zero (use the compiled defaults)
+ * to ~136 years. These are applied to a non-root dquot that has exceeded
+ * either limit.
+ */
+#define XFS_DQ_GRACE_MIN ((int64_t)0)
+#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+
+/*
+ * This is the main portion of the on-disk representation of quota information
+ * for a user. We pad this with some more expansion room to construct the on
+ * disk structure.
*/
struct xfs_disk_dquot {
__be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
__u8 d_version; /* dquot version */
- __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
+ __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */
__be32 d_id; /* user,project,group id */
__be64 d_blk_hardlimit;/* absolute limit on disk blks */
__be64 d_blk_softlimit;/* preferred limit on disk blks */
@@ -1177,7 +1304,7 @@ struct xfs_disk_dquot {
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
-typedef struct xfs_dqblk {
+struct xfs_dqblk {
struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
char dd_fill[4];/* filling for posterity */
@@ -1187,11 +1314,27 @@ typedef struct xfs_dqblk {
__be32 dd_crc; /* checksum */
__be64 dd_lsn; /* last modification in log */
uuid_t dd_uuid; /* location information */
-} xfs_dqblk_t;
+};
#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
/*
+ * This defines the unit of allocation of dquots.
+ *
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ *
+ * However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ *
+ * This is part of the ondisk format because the structure size is not a power
+ * of two, which leaves slack at the end of the disk block.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+
+/*
* Remote symlink format and access functions.
*/
#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
@@ -1218,7 +1361,7 @@ struct xfs_dsymlink_hdr {
#define XFS_SYMLINK_MAPS 3
#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ ((bufsize) - (xfs_has_crc((mp)) ? \
sizeof(struct xfs_dsymlink_hdr) : 0))
@@ -1421,20 +1564,6 @@ struct xfs_rmap_rec {
#define RMAPBT_UNUSED_OFFSET_BITLEN 7
#define RMAPBT_OFFSET_BITLEN 54
-#define XFS_RMAP_ATTR_FORK (1 << 0)
-#define XFS_RMAP_BMBT_BLOCK (1 << 1)
-#define XFS_RMAP_UNWRITTEN (1 << 2)
-#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
- XFS_RMAP_BMBT_BLOCK)
-#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
-struct xfs_rmap_irec {
- xfs_agblock_t rm_startblock; /* extent start block */
- xfs_extlen_t rm_blockcount; /* extent length */
- uint64_t rm_owner; /* extent owner */
- uint64_t rm_offset; /* offset within the owner */
- unsigned int rm_flags; /* state flags */
-};
-
/*
* Key structure
*
@@ -1450,7 +1579,7 @@ struct xfs_rmap_key {
typedef __be32 xfs_rmap_ptr_t;
#define XFS_RMAP_BLOCK(mp) \
- (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+ (xfs_has_finobt(((mp))) ? \
XFS_FIBT_BLOCK(mp) + 1 : \
XFS_IBT_BLOCK(mp) + 1)
@@ -1483,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp);
* on the startblock. This speeds up mount time deletion of stale
* staging extents because they're all at the right side of the tree.
*/
-#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31))
+#define XFS_REFC_COWFLAG (1U << 31)
#define REFCNTBT_COWFLAG_BITLEN 1
#define REFCNTBT_AGBLOCK_BITLEN 31
@@ -1497,12 +1626,6 @@ struct xfs_refcount_key {
__be32 rc_startblock; /* starting block number */
};
-struct xfs_refcount_irec {
- xfs_agblock_t rc_startblock; /* starting block number */
- xfs_extlen_t rc_blockcount; /* count of free blocks */
- xfs_nlink_t rc_refcount; /* number of inodes linked here */
-};
-
#define MAXREFCOUNT ((xfs_nlink_t)~0U)
#define MAXREFCEXTLEN ((xfs_extlen_t)~0U)
@@ -1542,6 +1665,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
/*
* bmbt records have a file offset (block) field that is 54 bits wide, so this
* is the largest xfs_fileoff_t that we ever expect to see.
@@ -1673,7 +1798,7 @@ struct xfs_acl_entry {
struct xfs_acl {
__be32 acl_cnt;
- struct xfs_acl_entry acl_entry[0];
+ struct xfs_acl_entry acl_entry[];
};
/*
@@ -1682,7 +1807,7 @@ struct xfs_acl {
* limited only by the maximum size of the xattr that stores the information.
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
- (xfs_sb_version_hascrc(&mp->m_sb) \
+ (xfs_has_crc(mp) \
? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index ef95ca07d084..1cfd5bc6520a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright (c) 1995-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -65,7 +65,7 @@ struct getbmapx {
/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */
#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
-#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
+#define BMV_IF_NO_DMAPI_READ 0x2 /* Deprecated */
#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
@@ -93,21 +93,6 @@ struct getbmapx {
#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
/*
- * Structure for XFS_IOC_FSSETDM.
- * For use by backup and restore programs to set the XFS on-disk inode
- * fields di_dmevmask and di_dmstate. These must be set to exactly and
- * only values previously obtained via xfs_bulkstat! (Specifically the
- * struct xfs_bstat fields bs_dmevmask and bs_dmstate.)
- */
-#ifndef HAVE_FSDMIDATA
-struct fsdmidata {
- __u32 fsd_dmevmask; /* corresponds to di_dmevmask */
- __u16 fsd_padding;
- __u16 fsd_dmstate; /* corresponds to di_dmstate */
-};
-#endif
-
-/*
* File segment locking set data type for 64 bit access.
* Also used for all the RESV/FREE interfaces.
*/
@@ -249,6 +234,9 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
+#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
/*
* Minimum and maximum sizes need for growth checks.
@@ -266,6 +254,8 @@ typedef struct xfs_fsop_resblks {
*/
#define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */
#define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */
+#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
/* keep the maximum size under 2^31 by a small amount */
#define XFS_MAX_LOG_BYTES \
@@ -388,7 +378,7 @@ struct xfs_bulkstat {
uint32_t bs_extsize_blks; /* extent size hint, blocks */
uint32_t bs_nlink; /* number of links */
- uint32_t bs_extents; /* number of extents */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
uint32_t bs_aextents; /* attribute number of extents */
uint16_t bs_version; /* structure version */
uint16_t bs_forkoff; /* inode fork offset in bytes */
@@ -397,8 +387,9 @@ struct xfs_bulkstat {
uint16_t bs_checked; /* checked inode metadata */
uint16_t bs_mode; /* type and mode */
uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
- uint64_t bs_pad[7]; /* zeroed */
+ uint64_t bs_pad[6]; /* zeroed */
};
#define XFS_BULKSTAT_VERSION_V1 (1)
@@ -470,17 +461,28 @@ struct xfs_bulk_ireq {
* Only return results from the specified @agno. If @ino is zero, start
* with the first inode of @agno.
*/
-#define XFS_BULK_IREQ_AGNO (1 << 0)
+#define XFS_BULK_IREQ_AGNO (1U << 0)
/*
* Return bulkstat information for a single inode, where @ino value is a
* special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
* values below. Not compatible with XFS_BULK_IREQ_AGNO.
*/
-#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
-#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
- XFS_BULK_IREQ_SPECIAL)
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_OLD.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
@@ -558,20 +560,44 @@ typedef struct xfs_fsop_handlereq {
/*
* Compound structures for passing args through Handle Request interfaces
- * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
- * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
- * XFS_IOC_ATTRMULTI_BY_HANDLE
+ * xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE
*/
-typedef struct xfs_fsop_setdm_handlereq {
- struct xfs_fsop_handlereq hreq; /* handle information */
- struct fsdmidata __user *data; /* DMAPI data */
-} xfs_fsop_setdm_handlereq_t;
+/*
+ * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
+ *
+ * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix.
+ */
+#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */
+#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */
+#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */
+#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */
typedef struct xfs_attrlist_cursor {
__u32 opaque[4];
} xfs_attrlist_cursor_t;
+/*
+ * Define how lists of attribute names are returned to userspace from the
+ * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the
+ * beginning of the returned buffer, and a each entry in al_offset contains the
+ * relative offset of an xfs_attrlist_ent containing the actual entry.
+ *
+ * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and
+ * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr.
+ */
+struct xfs_attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+};
+
+struct xfs_attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+};
+
typedef struct xfs_fsop_attrlist_handlereq {
struct xfs_fsop_handlereq hreq; /* handle interface structure */
struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
@@ -589,7 +615,7 @@ typedef struct xfs_attr_multiop {
void __user *am_attrname;
void __user *am_attrvalue;
__u32 am_length;
- __u32 am_flags;
+ __u32 am_flags; /* XFS_IOC_ATTR_* */
} xfs_attr_multiop_t;
typedef struct xfs_fsop_attrmulti_handlereq {
@@ -686,34 +712,34 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
-#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
/* o: Metadata object needs repair. */
-#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
/*
* o: Metadata object could be optimized. It's not corrupt, but
* we could improve on it somehow.
*/
-#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
/* o: Cross-referencing failed. */
-#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
/* o: Metadata object disagrees with cross-referenced metadata. */
-#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
/* o: Scan was not complete. */
-#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
/* o: Metadata object looked funny but isn't corrupt. */
-#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
/*
* o: IFLAG_REPAIR was set but metadata object did not need fixing or
* optimization and has therefore not been altered.
*/
-#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
@@ -747,15 +773,15 @@ struct xfs_scrub_metadata {
* For 'documentation' purposed more than anything else,
* the "cmd #" field reflects the IRIX fcntl number.
*/
-#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
-#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
+/* XFS_IOC_ALLOCSP ------- deprecated 10 */
+/* XFS_IOC_FREESP -------- deprecated 11 */
#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
-#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
+/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */
+/* XFS_IOC_FREESP64 ------ deprecated 37 */
#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
-#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata)
+/* XFS_IOC_FSSETDM ------- deprecated 39 */
#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
@@ -797,7 +823,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
-#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4)
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 272005ac8c88..99e796256c5d 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2019 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index bf161e930f1d..94db50eb706a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -10,7 +10,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
@@ -27,6 +26,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
/*
* Lookup a record by ino in the btree given by cur.
@@ -58,7 +58,7 @@ xfs_inobt_update(
union xfs_btree_rec rec;
rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
- if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
rec.inobt.ir_u.sp.ir_count = irec->ir_count;
rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
@@ -74,11 +74,11 @@ xfs_inobt_update(
void
xfs_inobt_btrec_to_irec(
struct xfs_mount *mp,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec)
{
irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
- if (xfs_sb_version_hassparseinodes(&mp->m_sb)) {
+ if (xfs_has_sparseinodes(mp)) {
irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
irec->ir_count = rec->inobt.ir_u.sp.ir_count;
irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
@@ -105,7 +105,6 @@ xfs_inobt_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
union xfs_btree_rec *rec;
int error;
uint64_t realfree;
@@ -116,7 +115,7 @@ xfs_inobt_get_rec(
xfs_inobt_btrec_to_irec(mp, rec, irec);
- if (!xfs_verify_agino(mp, agno, irec->ir_startino))
+ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
goto out_bad_rec;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
irec->ir_count > XFS_INODES_PER_CHUNK)
@@ -137,7 +136,8 @@ xfs_inobt_get_rec(
out_bad_rec:
xfs_warn(mp,
"%s Inode BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno);
+ cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
+ cur->bc_ag.pag->pag_agno);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -172,18 +172,17 @@ xfs_inobt_insert(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_agino_t newino,
xfs_agino_t newlen,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agino_t thisino;
int i;
int error;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
for (thisino = newino;
thisino < newino + newlen;
@@ -215,10 +214,9 @@ xfs_inobt_insert(
* Verify that the number of free inodes in the AGI is correct.
*/
#ifdef DEBUG
-STATIC int
+static int
xfs_check_agi_freecount(
- struct xfs_btree_cur *cur,
- struct xfs_agi *agi)
+ struct xfs_btree_cur *cur)
{
if (cur->bc_nlevels == 1) {
xfs_inobt_rec_incore_t rec;
@@ -243,13 +241,13 @@ xfs_check_agi_freecount(
}
} while (i == 1);
- if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ if (!xfs_is_shutdown(cur->bc_mp))
+ ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
}
return 0;
}
#else
-#define xfs_check_agi_freecount(cur, agi) 0
+#define xfs_check_agi_freecount(cur) 0
#endif
/*
@@ -304,7 +302,7 @@ xfs_ialloc_inode_init(
* That means for v3 inode we log the entire buffer rather than just the
* inode cores.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
@@ -339,7 +337,6 @@ xfs_ialloc_inode_init(
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = xfs_dinode_size(version);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
@@ -356,7 +353,7 @@ xfs_ialloc_inode_init(
} else if (tp) {
/* just log the inode core */
xfs_trans_log_buf(tp, fbuf, ioffset,
- ioffset + isize - 1);
+ ioffset + XFS_DINODE_SIZE(mp) - 1);
}
}
@@ -520,18 +517,17 @@ xfs_inobt_insert_sprec(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
int btnum,
struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
bool merge) /* merge or replace */
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
int error;
int i;
struct xfs_inobt_rec_incore rec;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -578,14 +574,14 @@ xfs_inobt_insert_sprec(
goto error;
}
- trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+ trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
rec.ir_holemask, nrec->ir_startino,
nrec->ir_holemask);
/* merge to nrec to output the updated record */
__xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+ trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
nrec->ir_holemask);
error = xfs_inobt_rec_check_count(mp, nrec);
@@ -606,28 +602,28 @@ error:
}
/*
- * Allocate new inodes in the allocation group specified by agbp.
- * Return 0 for success, else error code.
+ * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
+ * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
+ * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
+ * inode count threshold, or the usual negative error code for other errors.
*/
STATIC int
xfs_ialloc_ag_alloc(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- int *alloc)
+ struct xfs_perag *pag)
{
struct xfs_agi *agi;
struct xfs_alloc_arg args;
- xfs_agnumber_t agno;
int error;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe */
/* unit boundary */
/* init. to full chunk */
- uint16_t allocmask = (uint16_t) -1;
struct xfs_inobt_rec_incore rec;
- struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
+ uint16_t allocmask = (uint16_t) -1;
int do_sparse = 0;
memset(&args, 0, sizeof(args));
@@ -638,9 +634,9 @@ xfs_ialloc_ag_alloc(
#ifdef DEBUG
/* randomly do sparse inode allocations */
- if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+ if (xfs_has_sparseinodes(tp->t_mountp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks)
- do_sparse = prandom_u32() & 1;
+ do_sparse = prandom_u32_max(2);
#endif
/*
@@ -658,16 +654,15 @@ xfs_ialloc_ag_alloc(
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
*/
- agi = XFS_BUF_TO_AGI(agbp);
+ agi = agbp->b_addr;
newino = be32_to_cpu(agi->agi_newino);
- agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
igeo->ialloc_blks;
if (do_sparse)
goto sparse_alloc;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.prod = 1;
@@ -688,7 +683,7 @@ xfs_ialloc_ag_alloc(
args.minalignslop = igeo->cluster_align - 1;
/* Allow space for the inode btree to split. */
- args.minleft = igeo->inobt_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -716,7 +711,7 @@ xfs_ialloc_ag_alloc(
*/
isaligned = 0;
if (igeo->ialloc_align) {
- ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+ ASSERT(!xfs_has_noalign(args.mp));
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
@@ -727,7 +722,7 @@ xfs_ialloc_ag_alloc(
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
@@ -736,7 +731,7 @@ xfs_ialloc_ag_alloc(
/*
* Allow space for the inode btree to split.
*/
- args.minleft = igeo->inobt_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -748,7 +743,7 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.alignment = igeo->cluster_align;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -758,13 +753,13 @@ xfs_ialloc_ag_alloc(
* Finally, try a sparse allocation if the filesystem supports it and
* the sparse allocation length is smaller than a full chunk.
*/
- if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+ if (xfs_has_sparseinodes(args.mp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks &&
args.fsbno == NULLFSBLOCK) {
sparse_alloc:
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.alignment = args.mp->m_sb.sb_spino_align;
args.prod = 1;
@@ -795,10 +790,9 @@ sparse_alloc:
allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
}
- if (args.fsbno == NULLFSBLOCK) {
- *alloc = 0;
- return 0;
- }
+ if (args.fsbno == NULLFSBLOCK)
+ return -EAGAIN;
+
ASSERT(args.len == args.minlen);
/*
@@ -810,8 +804,8 @@ sparse_alloc:
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
- args.agbno, args.len, prandom_u32());
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
+ args.agbno, args.len, get_random_u32());
if (error)
return error;
@@ -837,12 +831,12 @@ sparse_alloc:
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
- error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
- &rec, true);
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_INO, &rec, true);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
- XFS_AGINO_TO_INO(args.mp, agno,
+ XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
rec.ir_startino),
rec.ir_holemask, rec.ir_count);
xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
@@ -861,22 +855,21 @@ sparse_alloc:
* from the previous call. Set merge false to replace any
* existing record with this one.
*/
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
- XFS_BTNUM_FINO, &rec,
- false);
+ if (xfs_has_finobt(args.mp)) {
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ XFS_BTNUM_FINO, &rec, false);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen,
XFS_BTNUM_INO);
if (error)
return error;
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+ if (xfs_has_finobt(args.mp)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino,
newlen, XFS_BTNUM_FINO);
if (error)
return error;
@@ -888,10 +881,8 @@ sparse_alloc:
*/
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- pag = xfs_perag_get(args.mp, agno);
pag->pagi_freecount += newlen;
pag->pagi_count += newlen;
- xfs_perag_put(pag);
agi->agi_newino = cpu_to_be32(newino);
/*
@@ -904,143 +895,9 @@ sparse_alloc:
*/
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
- *alloc = 1;
return 0;
}
-STATIC xfs_agnumber_t
-xfs_ialloc_next_ag(
- xfs_mount_t *mp)
-{
- xfs_agnumber_t agno;
-
- spin_lock(&mp->m_agirotor_lock);
- agno = mp->m_agirotor;
- if (++mp->m_agirotor >= mp->m_maxagi)
- mp->m_agirotor = 0;
- spin_unlock(&mp->m_agirotor_lock);
-
- return agno;
-}
-
-/*
- * Select an allocation group to look for a free inode in, based on the parent
- * inode and the mode. Return the allocation group buffer.
- */
-STATIC xfs_agnumber_t
-xfs_ialloc_ag_select(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t parent, /* parent directory inode number */
- umode_t mode) /* bits set to indicate file type */
-{
- xfs_agnumber_t agcount; /* number of ag's in the filesystem */
- xfs_agnumber_t agno; /* current ag number */
- int flags; /* alloc buffer locking flags */
- xfs_extlen_t ineed; /* blocks needed for inode allocation */
- xfs_extlen_t longest = 0; /* longest extent available */
- xfs_mount_t *mp; /* mount point structure */
- int needspace; /* file mode implies space allocated */
- xfs_perag_t *pag; /* per allocation group data */
- xfs_agnumber_t pagno; /* parent (starting) ag number */
- int error;
-
- /*
- * Files of these types need at least one block if length > 0
- * (and they won't fit in the inode, but that's hard to figure out).
- */
- needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
- mp = tp->t_mountp;
- agcount = mp->m_maxagi;
- if (S_ISDIR(mode))
- pagno = xfs_ialloc_next_ag(mp);
- else {
- pagno = XFS_INO_TO_AGNO(mp, parent);
- if (pagno >= agcount)
- pagno = 0;
- }
-
- ASSERT(pagno < agcount);
-
- /*
- * Loop through allocation groups, looking for one with a little
- * free space in it. Note we don't look for free inodes, exactly.
- * Instead, we include whether there is a need to allocate inodes
- * to mean that blocks must be allocated for them,
- * if none are currently free.
- */
- agno = pagno;
- flags = XFS_ALLOC_FLAG_TRYLOCK;
- for (;;) {
- pag = xfs_perag_get(mp, agno);
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto nextag;
- }
-
- if (!pag->pagi_init) {
- error = xfs_ialloc_pagi_init(mp, tp, agno);
- if (error)
- goto nextag;
- }
-
- if (pag->pagi_freecount) {
- xfs_perag_put(pag);
- return agno;
- }
-
- if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, agno, flags);
- if (error)
- goto nextag;
- }
-
- /*
- * Check that there is enough free space for the file plus a
- * chunk of inodes if we need to allocate some. If this is the
- * first pass across the AGs, take into account the potential
- * space needed for alignment of inode chunks when checking the
- * longest contiguous free space in the AG - this prevents us
- * from getting ENOSPC because we have free space larger than
- * ialloc_blks but alignment constraints prevent us from using
- * it.
- *
- * If we can't find an AG with space for full alignment slack to
- * be taken into account, we must be near ENOSPC in all AGs.
- * Hence we don't include alignment for the second pass and so
- * if we fail allocation due to alignment issues then it is most
- * likely a real ENOSPC condition.
- */
- ineed = M_IGEO(mp)->ialloc_min_blks;
- if (flags && ineed > 1)
- ineed += M_IGEO(mp)->cluster_align;
- longest = pag->pagf_longest;
- if (!longest)
- longest = pag->pagf_flcount > 0;
-
- if (pag->pagf_freeblks >= needspace + ineed &&
- longest >= ineed) {
- xfs_perag_put(pag);
- return agno;
- }
-nextag:
- xfs_perag_put(pag);
- /*
- * No point in iterating over the rest, if we're shutting
- * down.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
- return NULLAGNUMBER;
- agno++;
- if (agno >= agcount)
- agno = 0;
- if (agno == pagno) {
- if (flags == 0)
- return NULLAGNUMBER;
- flags = 0;
- }
- }
-}
-
/*
* Try to retrieve the next record to the left/right from the current one.
*/
@@ -1126,15 +983,14 @@ STATIC int
xfs_dialloc_ag_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_ino_t parent,
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
- struct xfs_perag *pag;
struct xfs_btree_cur *cur, *tcur;
struct xfs_inobt_rec_incore rec, trec;
xfs_ino_t ino;
@@ -1143,14 +999,12 @@ xfs_dialloc_ag_inobt(
int i, j;
int searchdistance = 10;
- pag = xfs_perag_get(mp, agno);
-
ASSERT(pag->pagi_init);
ASSERT(pag->pagi_inodeok);
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1158,14 +1012,14 @@ xfs_dialloc_ag_inobt(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
/*
* If in the same AG as the parent, try to get near the parent.
*/
- if (pagno == agno) {
+ if (pagno == pag->pag_agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
@@ -1368,7 +1222,7 @@ alloc_inode:
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
error = xfs_inobt_update(cur, &rec);
@@ -1378,20 +1232,18 @@ alloc_inode:
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
pag->pagi_freecount--;
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
- xfs_perag_put(pag);
*inop = ino;
return 0;
error1:
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- xfs_perag_put(pag);
return error;
}
@@ -1575,19 +1427,18 @@ xfs_dialloc_ag_update_inobt(
* The caller selected an AG for us, and made sure that free inodes are
* available.
*/
-STATIC int
+static int
xfs_dialloc_ag(
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_ino_t parent,
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
- struct xfs_perag *pag;
struct xfs_btree_cur *cur; /* finobt cursor */
struct xfs_btree_cur *icur; /* inobt cursor */
struct xfs_inobt_rec_incore rec;
@@ -1596,10 +1447,8 @@ xfs_dialloc_ag(
int offset;
int i;
- if (!xfs_sb_version_hasfinobt(&mp->m_sb))
- return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
-
- pag = xfs_perag_get(mp, agno);
+ if (!xfs_has_finobt(mp))
+ return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
@@ -1608,9 +1457,9 @@ xfs_dialloc_ag(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error_cur;
@@ -1619,7 +1468,7 @@ xfs_dialloc_ag(
* parent. If so, find the closest available inode to the parent. If
* not, consider the agi hint or find the first free inode in the AG.
*/
- if (agno == pagno)
+ if (pag->pag_agno == pagno)
error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
else
error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
@@ -1631,7 +1480,7 @@ xfs_dialloc_ag(
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
/*
* Modify or remove the finobt record.
@@ -1651,9 +1500,9 @@ xfs_dialloc_ag(
* the original freecount. If all is well, make the equivalent update to
* the inobt using the finobt record and offset information.
*/
- icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
- error = xfs_check_agi_freecount(icur, agi);
+ error = xfs_check_agi_freecount(icur);
if (error)
goto error_icur;
@@ -1671,16 +1520,15 @@ xfs_dialloc_ag(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
- error = xfs_check_agi_freecount(icur, agi);
+ error = xfs_check_agi_freecount(icur);
if (error)
goto error_icur;
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error_icur;
xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- xfs_perag_put(pag);
*inop = ino;
return 0;
@@ -1688,73 +1536,226 @@ error_icur:
xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
error_cur:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- xfs_perag_put(pag);
+ return error;
+}
+
+static int
+xfs_dialloc_roll(
+ struct xfs_trans **tpp,
+ struct xfs_buf *agibp)
+{
+ struct xfs_trans *tp = *tpp;
+ struct xfs_dquot_acct *dqinfo;
+ int error;
+
+ /*
+ * Hold to on to the agibp across the commit so no other allocation can
+ * come in and take the free inodes we just allocated for our caller.
+ */
+ xfs_trans_bhold(tp, agibp);
+
+ /*
+ * We want the quota changes to be associated with the next transaction,
+ * NOT this one. So, detach the dqinfo from this and attach it to the
+ * next transaction.
+ */
+ dqinfo = tp->t_dqinfo;
+ tp->t_dqinfo = NULL;
+
+ error = xfs_trans_roll(&tp);
+
+ /* Re-attach the quota info that we detached from prev trx. */
+ tp->t_dqinfo = dqinfo;
+
+ /*
+ * Join the buffer even on commit error so that the buffer is released
+ * when the caller cancels the transaction and doesn't have to handle
+ * this error case specially.
+ */
+ xfs_trans_bjoin(tp, agibp);
+ *tpp = tp;
+ return error;
+}
+
+static xfs_agnumber_t
+xfs_ialloc_next_ag(
+ xfs_mount_t *mp)
+{
+ xfs_agnumber_t agno;
+
+ spin_lock(&mp->m_agirotor_lock);
+ agno = mp->m_agirotor;
+ if (++mp->m_agirotor >= mp->m_maxagi)
+ mp->m_agirotor = 0;
+ spin_unlock(&mp->m_agirotor_lock);
+
+ return agno;
+}
+
+static bool
+xfs_dialloc_good_ag(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ umode_t mode,
+ int flags,
+ bool ok_alloc)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_extlen_t ineed;
+ xfs_extlen_t longest = 0;
+ int needspace;
+ int error;
+
+ if (!pag->pagi_inodeok)
+ return false;
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_read_agi(pag, tp, NULL);
+ if (error)
+ return false;
+ }
+
+ if (pag->pagi_freecount)
+ return true;
+ if (!ok_alloc)
+ return false;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_read_agf(pag, tp, flags, NULL);
+ if (error)
+ return false;
+ }
+
+ /*
+ * Check that there is enough free space for the file plus a chunk of
+ * inodes if we need to allocate some. If this is the first pass across
+ * the AGs, take into account the potential space needed for alignment
+ * of inode chunks when checking the longest contiguous free space in
+ * the AG - this prevents us from getting ENOSPC because we have free
+ * space larger than ialloc_blks but alignment constraints prevent us
+ * from using it.
+ *
+ * If we can't find an AG with space for full alignment slack to be
+ * taken into account, we must be near ENOSPC in all AGs. Hence we
+ * don't include alignment for the second pass and so if we fail
+ * allocation due to alignment issues then it is most likely a real
+ * ENOSPC condition.
+ *
+ * XXX(dgc): this calculation is now bogus thanks to the per-ag
+ * reservations that xfs_alloc_fix_freelist() now does via
+ * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
+ * be more than large enough for the check below to succeed, but
+ * xfs_alloc_space_available() will fail because of the non-zero
+ * metadata reservation and hence we won't actually be able to allocate
+ * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
+ * because of this.
+ */
+ ineed = M_IGEO(mp)->ialloc_min_blks;
+ if (flags && ineed > 1)
+ ineed += M_IGEO(mp)->cluster_align;
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+ needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+
+ if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
+ return false;
+ return true;
+}
+
+static int
+xfs_dialloc_try_ag(
+ struct xfs_trans **tpp,
+ struct xfs_perag *pag,
+ xfs_ino_t parent,
+ xfs_ino_t *new_ino,
+ bool ok_alloc)
+{
+ struct xfs_buf *agbp;
+ xfs_ino_t ino;
+ int error;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
+ if (error)
+ return error;
+
+ if (!pag->pagi_freecount) {
+ if (!ok_alloc) {
+ error = -EAGAIN;
+ goto out_release;
+ }
+
+ error = xfs_ialloc_ag_alloc(*tpp, agbp, pag);
+ if (error < 0)
+ goto out_release;
+
+ /*
+ * We successfully allocated space for an inode cluster in this
+ * AG. Roll the transaction so that we can allocate one of the
+ * new inodes.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ error = xfs_dialloc_roll(tpp, agbp);
+ if (error)
+ goto out_release;
+ }
+
+ /* Allocate an inode in the found AG */
+ error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino);
+ if (!error)
+ *new_ino = ino;
+ return error;
+
+out_release:
+ xfs_trans_brelse(*tpp, agbp);
return error;
}
/*
- * Allocate an inode on disk.
- *
- * Mode is used to tell whether the new inode will need space, and whether it
- * is a directory.
+ * Allocate an on-disk inode.
*
- * This function is designed to be called twice if it has to do an allocation
- * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
- * If an inode is available without having to performn an allocation, an inode
- * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
- * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
- * The caller should then commit the current transaction, allocate a
- * new transaction, and call xfs_dialloc() again, passing in the previous value
- * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
- * buffer is locked across the two calls, the second call is guaranteed to have
- * a free inode available.
- *
- * Once we successfully pick an inode its number is returned and the on-disk
- * data structures are updated. The inode itself is not read in, since doing so
- * would break ordering constraints with xfs_reclaim.
+ * Mode is used to tell whether the new inode is a directory and hence where to
+ * locate it. The on-disk inode that is allocated will be returned in @new_ino
+ * on success, otherwise an error will be set to indicate the failure (e.g.
+ * -ENOSPC).
*/
int
xfs_dialloc(
- struct xfs_trans *tp,
+ struct xfs_trans **tpp,
xfs_ino_t parent,
umode_t mode,
- struct xfs_buf **IO_agbp,
- xfs_ino_t *inop)
+ xfs_ino_t *new_ino)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *agbp;
+ struct xfs_mount *mp = (*tpp)->t_mountp;
xfs_agnumber_t agno;
- int error;
- int ialloced;
- int noroom = 0;
+ int error = 0;
xfs_agnumber_t start_agno;
struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
- int okalloc = 1;
-
- if (*IO_agbp) {
- /*
- * If the caller passes in a pointer to the AGI buffer,
- * continue where we left off before. In this case, we
- * know that the allocation group has free inodes.
- */
- agbp = *IO_agbp;
- goto out_alloc;
- }
+ bool ok_alloc = true;
+ int flags;
+ xfs_ino_t ino;
/*
- * We do not have an agbp, so select an initial allocation
- * group for inode allocation.
+ * Directories, symlinks, and regular files frequently allocate at least
+ * one block, so factor that potential expansion when we examine whether
+ * an AG has enough space for file creation.
*/
- start_agno = xfs_ialloc_ag_select(tp, parent, mode);
- if (start_agno == NULLAGNUMBER) {
- *inop = NULLFSINO;
- return 0;
+ if (S_ISDIR(mode))
+ start_agno = xfs_ialloc_next_ag(mp);
+ else {
+ start_agno = XFS_INO_TO_AGNO(mp, parent);
+ if (start_agno >= mp->m_maxagi)
+ start_agno = 0;
}
/*
* If we have already hit the ceiling of inode blocks then clear
- * okalloc so we scan all available agi structures for a free
+ * ok_alloc so we scan all available agi structures for a free
* inode.
*
* Read rough value of mp->m_icount by percpu_counter_read_positive,
@@ -1763,8 +1764,7 @@ xfs_dialloc(
if (igeo->maxicount &&
percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
> igeo->maxicount) {
- noroom = 1;
- okalloc = 0;
+ ok_alloc = false;
}
/*
@@ -1773,85 +1773,34 @@ xfs_dialloc(
* allocation groups upward, wrapping at the end.
*/
agno = start_agno;
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
for (;;) {
pag = xfs_perag_get(mp, agno);
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto nextag;
+ if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) {
+ error = xfs_dialloc_try_ag(tpp, pag, parent,
+ &ino, ok_alloc);
+ if (error != -EAGAIN)
+ break;
}
- if (!pag->pagi_init) {
- error = xfs_ialloc_pagi_init(mp, tp, agno);
- if (error)
- goto out_error;
- }
-
- /*
- * Do a first racy fast path check if this AG is usable.
- */
- if (!pag->pagi_freecount && !okalloc)
- goto nextag;
-
- /*
- * Then read in the AGI buffer and recheck with the AGI buffer
- * lock held.
- */
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- if (error)
- goto out_error;
-
- if (pag->pagi_freecount) {
- xfs_perag_put(pag);
- goto out_alloc;
- }
-
- if (!okalloc)
- goto nextag_relse_buffer;
-
-
- error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
- if (error) {
- xfs_trans_brelse(tp, agbp);
-
- if (error != -ENOSPC)
- goto out_error;
-
- xfs_perag_put(pag);
- *inop = NULLFSINO;
- return 0;
- }
-
- if (ialloced) {
- /*
- * We successfully allocated some inodes, return
- * the current context to the caller so that it
- * can commit the current transaction and call
- * us again where we left off.
- */
- ASSERT(pag->pagi_freecount > 0);
- xfs_perag_put(pag);
-
- *IO_agbp = agbp;
- *inop = NULLFSINO;
- return 0;
+ if (xfs_is_shutdown(mp)) {
+ error = -EFSCORRUPTED;
+ break;
}
-
-nextag_relse_buffer:
- xfs_trans_brelse(tp, agbp);
-nextag:
- xfs_perag_put(pag);
- if (++agno == mp->m_sb.sb_agcount)
+ if (++agno == mp->m_maxagi)
agno = 0;
if (agno == start_agno) {
- *inop = NULLFSINO;
- return noroom ? -ENOSPC : 0;
+ if (!flags) {
+ error = -ENOSPC;
+ break;
+ }
+ flags = 0;
}
+ xfs_perag_put(pag);
}
-out_alloc:
- *IO_agbp = NULL;
- return xfs_dialloc_ag(tp, agbp, parent, inop);
-out_error:
+ if (!error)
+ *new_ino = ino;
xfs_perag_put(pag);
return error;
}
@@ -1878,7 +1827,7 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks,
&XFS_RMAP_OINFO_INODES);
return;
@@ -1923,7 +1872,7 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
+ xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
contigblk, &XFS_RMAP_OINFO_INODES);
/* reset range to current bit and carry on... */
@@ -1939,13 +1888,12 @@ xfs_difree_inobt(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
- struct xfs_perag *pag;
+ struct xfs_agi *agi = agbp->b_addr;
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int ilen;
@@ -1959,9 +1907,9 @@ xfs_difree_inobt(
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
@@ -2004,11 +1952,13 @@ xfs_difree_inobt(
* remove the chunk if the block size is large enough for multiple inode
* chunks (that might not be free).
*/
- if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ struct xfs_perag *pag = agbp->b_pag;
+
xic->deleted = true;
- xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
@@ -2020,10 +1970,8 @@ xfs_difree_inobt(
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
- pag = xfs_perag_get(mp, agno);
pag->pagi_freecount -= ilen - 1;
pag->pagi_count -= ilen;
- xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
@@ -2033,7 +1981,7 @@ xfs_difree_inobt(
goto error0;
}
- xfs_difree_inode_chunk(tp, agno, &rec);
+ xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
} else {
xic->deleted = false;
@@ -2044,18 +1992,16 @@ xfs_difree_inobt(
goto error0;
}
- /*
+ /*
* Change the inode free counts and log the ag/sb changes.
*/
be32_add_cpu(&agi->agi_freecount, 1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- pag = xfs_perag_get(mp, agno);
pag->pagi_freecount++;
- xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error0;
@@ -2076,18 +2022,17 @@ xfs_difree_finobt(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
+ struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
- xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int offset = agino - ibtrec->ir_startino;
int error;
int i;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
@@ -2151,9 +2096,8 @@ xfs_difree_finobt(
* enough for multiple chunks. Leave the finobt record to remain in sync
* with the inobt.
*/
- if (rec.ir_free == XFS_INOBT_ALL_FREE &&
- mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
- !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
error = xfs_btree_delete(cur, &i);
if (error)
goto error;
@@ -2165,7 +2109,7 @@ xfs_difree_finobt(
}
out:
- error = xfs_check_agi_freecount(cur, agi);
+ error = xfs_check_agi_freecount(cur);
if (error)
goto error;
@@ -2185,36 +2129,33 @@ error:
*/
int
xfs_difree(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- struct xfs_icluster *xic) /* cluster info if deleted */
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_ino_t inode,
+ struct xfs_icluster *xic)
{
/* REFERENCED */
xfs_agblock_t agbno; /* block number containing inode */
struct xfs_buf *agbp; /* buffer for allocation group header */
xfs_agino_t agino; /* allocation group inode number */
- xfs_agnumber_t agno; /* allocation group number */
int error; /* error return value */
- struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_inobt_rec_incore rec;/* btree record */
- mp = tp->t_mountp;
-
/*
* Break up inode number into its components.
*/
- agno = XFS_INO_TO_AGNO(mp, inode);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
- __func__, agno, mp->m_sb.sb_agcount);
+ if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
+ xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
+ __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
ASSERT(0);
return -EINVAL;
}
agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
__func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
ASSERT(0);
return -EINVAL;
}
@@ -2228,7 +2169,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error) {
xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
__func__, error);
@@ -2238,15 +2179,15 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, xic, &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec);
if (error)
goto error0;
/*
* Fix up the free inode btree.
*/
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
- error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ if (xfs_has_finobt(mp)) {
+ error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec);
if (error)
goto error0;
}
@@ -2261,7 +2202,7 @@ STATIC int
xfs_imap_lookup(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agino_t agino,
xfs_agblock_t agbno,
xfs_agblock_t *chunk_agbno,
@@ -2274,11 +2215,11 @@ xfs_imap_lookup(
int error;
int i;
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
- __func__, error, agno);
+ __func__, error, pag->pag_agno);
return error;
}
@@ -2288,7 +2229,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -2322,42 +2263,44 @@ xfs_imap_lookup(
*/
int
xfs_imap(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t ino, /* inode to locate */
- struct xfs_imap *imap, /* location map structure */
- uint flags) /* flags for inode btree lookup */
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
{
- xfs_agblock_t agbno; /* block number of inode in the alloc group */
- xfs_agino_t agino; /* inode number within alloc group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agblock_t chunk_agbno; /* first block in inode chunk */
- xfs_agblock_t cluster_agbno; /* first block in inode cluster */
- int error; /* error code */
- int offset; /* index of inode in its buffer */
- xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
+ xfs_agblock_t agbno; /* block number of inode in the alloc group */
+ xfs_agino_t agino; /* inode number within alloc group */
+ xfs_agblock_t chunk_agbno; /* first block in inode chunk */
+ xfs_agblock_t cluster_agbno; /* first block in inode cluster */
+ int error; /* error code */
+ int offset; /* index of inode in its buffer */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
+ struct xfs_perag *pag;
ASSERT(ino != NULLFSINO);
/*
* Split up the inode number into its parts.
*/
- agno = XFS_INO_TO_AGNO(mp, ino);
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
- ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ if (!pag || agbno >= mp->m_sb.sb_agblocks ||
+ ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ error = -EINVAL;
#ifdef DEBUG
/*
* Don't output diagnostic information for untrusted inodes
* as they can be invalid without implying corruption.
*/
if (flags & XFS_IGET_UNTRUSTED)
- return -EINVAL;
- if (agno >= mp->m_sb.sb_agcount) {
+ goto out_drop;
+ if (!pag) {
xfs_alert(mp,
"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
- __func__, agno, mp->m_sb.sb_agcount);
+ __func__, XFS_INO_TO_AGNO(mp, ino),
+ mp->m_sb.sb_agcount);
}
if (agbno >= mp->m_sb.sb_agblocks) {
xfs_alert(mp,
@@ -2365,15 +2308,15 @@ xfs_imap(
__func__, (unsigned long long)agbno,
(unsigned long)mp->m_sb.sb_agblocks);
}
- if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
xfs_alert(mp,
"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
__func__, ino,
- XFS_AGINO_TO_INO(mp, agno, agino));
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
}
xfs_stack_trace();
#endif /* DEBUG */
- return -EINVAL;
+ goto out_drop;
}
/*
@@ -2384,10 +2327,10 @@ xfs_imap(
* in all cases where an untrusted inode number is passed.
*/
if (flags & XFS_IGET_UNTRUSTED) {
- error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
&chunk_agbno, &offset_agbno, flags);
if (error)
- return error;
+ goto out_drop;
goto out_map;
}
@@ -2399,11 +2342,12 @@ xfs_imap(
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
imap->im_len = XFS_FSB_TO_BB(mp, 1);
imap->im_boffset = (unsigned short)(offset <<
mp->m_sb.sb_inodelog);
- return 0;
+ error = 0;
+ goto out_drop;
}
/*
@@ -2415,10 +2359,10 @@ xfs_imap(
offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
- error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
&chunk_agbno, &offset_agbno, flags);
if (error)
- return error;
+ goto out_drop;
}
out_map:
@@ -2429,7 +2373,7 @@ out_map:
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
@@ -2446,9 +2390,14 @@ out_map:
__func__, (unsigned long long) imap->im_blkno,
(unsigned long long) imap->im_len,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
- return -EINVAL;
+ error = -EINVAL;
+ goto out_drop;
}
- return 0;
+ error = 0;
+out_drop:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2465,9 +2414,9 @@ out_map:
*/
void
xfs_ialloc_log_agi(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* allocation group header buffer */
- int fields) /* bitmask of fields to log */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte number */
int last; /* last byte number */
@@ -2486,12 +2435,12 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_unlinked),
offsetof(xfs_agi_t, agi_free_root),
offsetof(xfs_agi_t, agi_free_level),
+ offsetof(xfs_agi_t, agi_iblocks),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
- xfs_agi_t *agi; /* allocation group header */
+ struct xfs_agi *agi = bp->b_addr;
- agi = XFS_BUF_TO_AGI(bp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
@@ -2523,14 +2472,13 @@ xfs_agi_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ struct xfs_agi *agi = bp->b_addr;
int i;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (!xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
return __this_address;
}
@@ -2543,12 +2491,12 @@ xfs_agi_verify(
return __this_address;
if (be32_to_cpu(agi->agi_level) < 1 ||
- be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+ be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ if (xfs_has_finobt(mp) &&
(be32_to_cpu(agi->agi_free_level) < 1 ||
- be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS))
+ be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
return __this_address;
/*
@@ -2577,7 +2525,7 @@ xfs_agi_read_verify(
struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (xfs_has_crc(mp) &&
!xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
@@ -2593,6 +2541,7 @@ xfs_agi_write_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agi *agi = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_agi_verify(bp);
@@ -2601,11 +2550,11 @@ xfs_agi_write_verify(
return;
}
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (bip)
- XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
@@ -2622,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = {
*/
int
xfs_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* allocation group hdr buf */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
{
+ struct xfs_mount *mp = pag->pag_mount;
int error;
- trace_xfs_read_agi(mp, agno);
+ trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
if (error)
return error;
if (tp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
+ xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
- xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+ xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
return 0;
}
+/*
+ * Read in the agi and initialise the per-ag data. If the caller supplies a
+ * @agibpp, return the locked AGI buffer to them, otherwise release it.
+ */
int
xfs_ialloc_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* allocation group hdr buf */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
{
- struct xfs_agi *agi; /* allocation group header */
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
int error;
- trace_xfs_ialloc_read_agi(mp, agno);
+ trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
- error = xfs_read_agi(mp, tp, agno, bpp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(*bpp);
- pag = xfs_perag_get(mp, agno);
+ agi = agibp->b_addr;
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -2674,28 +2624,11 @@ xfs_ialloc_read_agi(
* we are in the middle of a forced shutdown.
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- xfs_perag_put(pag);
- return 0;
-}
-
-/*
- * Read in the agi to initialise the per-ag data in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno) /* allocation group number */
-{
- xfs_buf_t *bp = NULL;
- int error;
-
- error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
- if (error)
- return error;
- if (bp)
- xfs_trans_brelse(tp, bp);
+ xfs_is_shutdown(pag->pag_mount));
+ if (agibpp)
+ *agibpp = agibp;
+ else
+ xfs_trans_brelse(tp, agibp);
return 0;
}
@@ -2765,7 +2698,7 @@ struct xfs_ialloc_count_inodes {
STATIC int
xfs_ialloc_count_inodes_rec(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
struct xfs_inobt_rec_incore irec;
@@ -2821,6 +2754,12 @@ xfs_ialloc_setup_geometry(
uint64_t icount;
uint inodes;
+ igeo->new_diflags2 = 0;
+ if (xfs_has_bigtime(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
+
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
@@ -2841,6 +2780,7 @@ xfs_ialloc_setup_geometry(
inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
inodes);
+ ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
/*
* Set the maximum inode count for this filesystem, being careful not
@@ -2873,7 +2813,7 @@ xfs_ialloc_setup_geometry(
* cannot change the behavior.
*/
igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_v3inodes(mp)) {
int new_size = igeo->inode_cluster_size_raw;
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
@@ -2891,7 +2831,7 @@ xfs_ialloc_setup_geometry(
igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
/* Calculate inode cluster alignment. */
- if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ if (xfs_has_align(mp) &&
mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
igeo->cluster_align = mp->m_sb.sb_inoalignmt;
else
@@ -2939,15 +2879,15 @@ xfs_ialloc_calc_rootino(
first_bno += xfs_alloc_min_freelist(mp, NULL);
/* ...the free inode btree root... */
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
first_bno++;
/* ...the reverse mapping btree root... */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
first_bno++;
/* ...the reference count btree... */
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
first_bno++;
/*
@@ -2957,19 +2897,73 @@ xfs_ialloc_calc_rootino(
* allocation group, or very odd geometries created by old mkfs
* versions on very small filesystems.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+ if (xfs_ag_contains_log(mp, 0))
first_bno += mp->m_sb.sb_logblocks;
/*
* Now round first_bno up to whatever allocation alignment is given
* by the filesystem or was passed in.
*/
- if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0)
+ if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
first_bno = roundup(first_bno, sunit);
- else if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ else if (xfs_has_align(mp) &&
mp->m_sb.sb_inoalignmt > 1)
first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
}
+
+/*
+ * Ensure there are not sparse inode clusters that cross the new EOAG.
+ *
+ * This is a no-op for non-spinode filesystems since clusters are always fully
+ * allocated and checking the bnobt suffices. However, a spinode filesystem
+ * could have a record where the upper inodes are free blocks. If those blocks
+ * were removed from the filesystem, the inode record would extend beyond EOAG,
+ * which will be flagged as corruption.
+ */
+int
+xfs_ialloc_check_shrink(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ xfs_agblock_t new_length)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length);
+ int has;
+ int error;
+
+ if (!xfs_has_sparseinodes(mp))
+ return 0;
+
+ pag = xfs_perag_get(mp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO);
+
+ /* Look up the inobt record that would correspond to the new EOFS. */
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
+ if (error || !has)
+ goto out;
+
+ error = xfs_inobt_get_rec(cur, &rec, &has);
+ if (error)
+ goto out;
+
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* If the record covers inodes that would be beyond EOFS, bail out. */
+ if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
+ error = -ENOSPC;
+ goto out;
+ }
+out:
+ xfs_btree_del_cursor(cur, error);
+ xfs_perag_put(pag);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 72b3468b97b1..9bbbca6ac4ed 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -33,46 +33,14 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
}
/*
- * Allocate an inode on disk.
- * Mode is used to tell whether the new inode will need space, and whether
- * it is a directory.
- *
- * To work within the constraint of one allocation per transaction,
- * xfs_dialloc() is designed to be called twice if it has to do an
- * allocation to make more free inodes. If an inode is
- * available without an allocation, agbp would be set to the current
- * agbp and alloc_done set to false.
- * If an allocation needed to be done, agbp would be set to the
- * inode header of the allocation group and alloc_done set to true.
- * The caller should then commit the current transaction and allocate a new
- * transaction. xfs_dialloc() should then be called again with
- * the agbp value returned from the previous call.
- *
- * Once we successfully pick an inode its number is returned and the
- * on-disk data structures are updated. The inode itself is not read
- * in, since doing so would break ordering constraints with xfs_reclaim.
- *
- * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
+ * Allocate an inode on disk. Mode is used to tell whether the new inode will
+ * need space, and whether it is a directory.
*/
-int /* error */
-xfs_dialloc(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_ino_t parent, /* parent inode (directory) */
- umode_t mode, /* mode bits for new inode */
- struct xfs_buf **agbp, /* buf for a.g. inode header */
- xfs_ino_t *inop); /* inode number allocated */
+int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+ xfs_ino_t *new_ino);
-/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int /* error */
-xfs_difree(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- struct xfs_icluster *ifree); /* cluster info if deleted */
+int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
+ xfs_ino_t ino, struct xfs_icluster *ifree);
/*
* Return the location of the inode in imap, for mapping it into a buffer.
@@ -92,27 +60,12 @@ void
xfs_ialloc_log_agi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* allocation group header buffer */
- int fields); /* bitmask of fields to log */
+ uint32_t fields); /* bitmask of fields to log */
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int /* error */
-xfs_ialloc_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp); /* allocation group hdr buf */
-
-/*
- * Read in the allocation group header to initialise the per-ag data
- * in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno); /* allocation group number */
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
/*
* Lookup a record by ino in the btree given by cur.
@@ -134,11 +87,10 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
-int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
union xfs_btree_rec;
-void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec,
+void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
+ const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
@@ -154,4 +106,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
+int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf *agibp, xfs_agblock_t new_length);
+
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b82992f795aa..8c83e265770c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -12,6 +12,7 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
@@ -19,7 +20,9 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
+static struct kmem_cache *xfs_inobt_cur_cache;
STATIC int
xfs_inobt_get_minrecs(
@@ -34,18 +37,17 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno,
- cur->bc_btnum);
+ cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
}
STATIC void
xfs_inobt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *nptr,
- int inc) /* level change */
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
agi->agi_root = nptr->s;
be32_add_cpu(&agi->agi_level, inc);
@@ -54,12 +56,12 @@ xfs_inobt_set_root(
STATIC void
xfs_finobt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *nptr,
- int inc) /* level change */
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *nptr,
+ int inc) /* level change */
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
agi->agi_free_root = nptr->s;
be32_add_cpu(&agi->agi_free_level, inc);
@@ -67,13 +69,32 @@ xfs_finobt_set_root(
XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
}
+/* Update the inode btree block counter for this btree. */
+static inline void
+xfs_inobt_mod_blockcount(
+ struct xfs_btree_cur *cur,
+ int howmuch)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ if (!xfs_has_inobtcounts(cur->bc_mp))
+ return;
+
+ if (cur->bc_btnum == XFS_BTNUM_FINO)
+ be32_add_cpu(&agi->agi_fblocks, howmuch);
+ else if (cur->bc_btnum == XFS_BTNUM_INO)
+ be32_add_cpu(&agi->agi_iblocks, howmuch);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
+}
+
STATIC int
__xfs_inobt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat,
- enum xfs_ag_resv_type resv)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat,
+ enum xfs_ag_resv_type resv)
{
xfs_alloc_arg_t args; /* block allocation args */
int error; /* error return value */
@@ -83,7 +104,7 @@ __xfs_inobt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.oinfo = XFS_RMAP_OINFO_INOBT;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
@@ -102,25 +123,26 @@ __xfs_inobt_alloc_block(
new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
+ xfs_inobt_mod_blockcount(cur, 1);
return 0;
}
STATIC int
xfs_inobt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
}
STATIC int
xfs_finobt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
@@ -134,8 +156,9 @@ __xfs_inobt_free_block(
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
+ xfs_inobt_mod_blockcount(cur, -1);
return xfs_free_extent(cur->bc_tp,
- XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
+ XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
&XFS_RMAP_OINFO_INOBT, resv);
}
@@ -167,18 +190,18 @@ xfs_inobt_get_maxrecs(
STATIC void
xfs_inobt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->inobt.ir_startino = rec->inobt.ir_startino;
}
STATIC void
xfs_inobt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- __u32 x;
+ __u32 x;
x = be32_to_cpu(rec->inobt.ir_startino);
x += XFS_INODES_PER_CHUNK - 1;
@@ -191,7 +214,7 @@ xfs_inobt_init_rec_from_cur(
union xfs_btree_rec *rec)
{
rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
- if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ if (xfs_has_sparseinodes(cur->bc_mp)) {
rec->inobt.ir_u.sp.ir_holemask =
cpu_to_be16(cur->bc_rec.i.ir_holemask);
rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
@@ -212,9 +235,9 @@ xfs_inobt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_root;
}
@@ -224,16 +247,16 @@ xfs_finobt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_free_root;
}
STATIC int64_t
xfs_inobt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
cur->bc_rec.i.ir_startino;
@@ -241,9 +264,9 @@ xfs_inobt_key_diff(
STATIC int64_t
xfs_inobt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
be32_to_cpu(k2->inobt.ir_startino);
@@ -271,7 +294,7 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
@@ -339,9 +362,9 @@ const struct xfs_buf_ops xfs_finobt_buf_ops = {
STATIC int
xfs_inobt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->inobt.ir_startino) <
be32_to_cpu(k2->inobt.ir_startino);
@@ -349,9 +372,9 @@ xfs_inobt_keys_inorder(
STATIC int
xfs_inobt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
be32_to_cpu(r2->inobt.ir_startino);
@@ -400,46 +423,123 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
};
/*
- * Allocate a new inode btree cursor.
+ * Initialize a new inode btree cursor.
*/
-struct xfs_btree_cur * /* new inode btree cursor */
-xfs_inobt_init_cursor(
+static struct xfs_btree_cur *
+xfs_inobt_init_common(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agi structure */
- xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_perag *pag,
xfs_btnum_t btnum) /* ialloc or free ino btree */
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
+ cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+ M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
if (btnum == XFS_BTNUM_INO) {
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- cur->bc_ops = &xfs_inobt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
+ cur->bc_ops = &xfs_inobt_ops;
} else {
- cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
- cur->bc_ops = &xfs_finobt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
+ cur->bc_ops = &xfs_finobt_ops;
}
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+ return cur;
+}
+
+/* Create an inode btree cursor. */
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur = xfs_inobt_init_common(mp, tp, pag, btnum);
+ if (btnum == XFS_BTNUM_INO)
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ else
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create an inode btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_inobt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ cur = xfs_inobt_init_common(mp, NULL, pag, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
/*
+ * Install a new inobt btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_inobt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+ int fields;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ if (cur->bc_btnum == XFS_BTNUM_INO) {
+ fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
+ agi->agi_root = cpu_to_be32(afake->af_root);
+ agi->agi_level = cpu_to_be32(afake->af_levels);
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
+ agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ } else {
+ fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
+ agi->agi_free_root = cpu_to_be32(afake->af_root);
+ agi->agi_free_level = cpu_to_be32(afake->af_levels);
+ if (xfs_has_inobtcounts(cur->bc_mp)) {
+ agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ }
+}
+
+/* Calculate number of records in an inode btree block. */
+static inline unsigned int
+xfs_inobt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
+
+/*
* Calculate number of records in an inobt btree block.
*/
int
@@ -449,10 +549,54 @@ xfs_inobt_maxrecs(
int leaf)
{
blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+ return xfs_inobt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(xfs_inobt_rec_t);
- return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+/*
+ * Maximum number of inode btree records per AG. Pretend that we can fill an
+ * entire AG completely full of inodes except for the AG headers.
+ */
+#define XFS_MAX_INODE_RECORDS \
+ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \
+ XFS_INODES_PER_CHUNK
+
+/* Compute the max possible height for the inode btree. */
+static inline unsigned int
+xfs_inobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN,
+ XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN);
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for the free inode btree. */
+static inline unsigned int
+xfs_finobt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS);
+}
+
+/* Compute the max possible height for either inode btree. */
+unsigned int
+xfs_iallocbt_maxlevels_ondisk(void)
+{
+ return max(xfs_inobt_maxlevels_ondisk(),
+ xfs_finobt_maxlevels_ondisk());
}
/*
@@ -539,10 +683,10 @@ xfs_inobt_rec_check_count(
static xfs_extlen_t
xfs_inobt_max_size(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
- xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_agblock_t agblocks = pag->block_count;
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (M_IGEO(mp)->inobt_mxr[0] == 0)
@@ -553,8 +697,7 @@ xfs_inobt_max_size(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
@@ -567,7 +710,7 @@ int
xfs_inobt_cur(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t which,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp)
@@ -578,16 +721,11 @@ xfs_inobt_cur(
ASSERT(*agi_bpp == NULL);
ASSERT(*curpp == NULL);
- error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
if (error)
return error;
- cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which);
- if (!cur) {
- xfs_trans_brelse(tp, *agi_bpp);
- *agi_bpp = NULL;
- return -ENOMEM;
- }
+ cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which);
*curpp = cur;
return 0;
}
@@ -596,7 +734,7 @@ static int
xfs_inobt_count_blocks(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
@@ -604,7 +742,7 @@ xfs_inobt_count_blocks(
struct xfs_btree_cur *cur = NULL;
int error;
- error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp);
+ error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp);
if (error)
return error;
@@ -615,6 +753,27 @@ xfs_inobt_count_blocks(
return error;
}
+/* Read finobt block count from AGI header. */
+static int
+xfs_finobt_read_blocks(
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agi *agi;
+ int error;
+
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
+ if (error)
+ return error;
+
+ agi = agbp->b_addr;
+ *tree_blocks = be32_to_cpu(agi->agi_fblocks);
+ xfs_trans_brelse(tp, agbp);
+ return 0;
+}
+
/*
* Figure out how many blocks to reserve and how many are used by this btree.
*/
@@ -622,21 +781,25 @@ int
xfs_finobt_calc_reserves(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
xfs_extlen_t tree_len = 0;
int error;
- if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (!xfs_has_finobt(mp))
return 0;
- error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len);
+ if (xfs_has_inobtcounts(mp))
+ error = xfs_finobt_read_blocks(pag, tp, &tree_len);
+ else
+ error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
+ &tree_len);
if (error)
return error;
- *ask += xfs_inobt_max_size(mp, agno);
+ *ask += xfs_inobt_max_size(pag);
*used += tree_len;
return 0;
}
@@ -649,3 +812,22 @@ xfs_iallocbt_calc_size(
{
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}
+
+int __init
+xfs_inobt_init_cur_cache(void)
+{
+ xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur",
+ xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_inobt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_inobt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_inobt_cur_cache);
+ xfs_inobt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 951305ecaae1..26451cb76b98 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -13,12 +13,13 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xfs_perag;
/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_INOBT_BLOCK_LEN(mp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+ (xfs_has_crc(((mp))) ? \
XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
/*
@@ -45,9 +46,12 @@ struct xfs_mount;
(maxrecs) * sizeof(xfs_inobt_key_t) + \
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
- xfs_btnum_t);
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfs_buf *agbp,
+ struct xfs_perag *pag, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -61,11 +65,19 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
#endif /* DEBUG */
int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_btnum_t btnum,
+ struct xfs_perag *pag, xfs_btnum_t btnum,
struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
+void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_iallocbt_maxlevels_ondisk(void);
+
+int __init xfs_inobt_init_cur_cache(void);
+void xfs_inobt_destroy_cur_cache(void);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 52451809c478..773cf4349428 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -8,9 +8,9 @@
#include "xfs_format.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_inode.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_trace.h"
/*
@@ -603,7 +603,7 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
+ new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
ifp->if_u1.if_root = new;
cur->leaf = new;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8afacfe4be0a..758aacd8166b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_inode.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
@@ -21,41 +22,6 @@
#include <linux/iversion.h>
/*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
- */
-#if defined(DEBUG)
-void
-xfs_inobp_check(
- xfs_mount_t *mp,
- xfs_buf_t *bp)
-{
- int i;
- xfs_dinode_t *dip;
-
- for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
- dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
- if (!dip->di_next_unlinked) {
- xfs_alert(mp,
- "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
- i, (long long)bp->b_bn);
- }
- }
-}
-#endif
-
-bool
-xfs_dinode_good_version(
- struct xfs_mount *mp,
- __u8 version)
-{
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return version == 3;
-
- return version == 1 || version == 2;
-}
-
-/*
* If we are doing readahead on an inode buffer, we might be in log recovery
* reading an inode allocation buffer that hasn't yet been replayed, and hence
* has not had the inode cores stamped into it. Hence for readahead, the buffer
@@ -64,10 +30,10 @@ xfs_dinode_good_version(
* If the readahead buffer is invalid, we need to mark it with an error and
* clear the DONE status of the buffer so that a followup read will re-read it
* from disk. We don't report the error otherwise to avoid warnings during log
- * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
+ * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
* because all we want to do is say readahead failed; there is no-one to report
* the error to, so this will distinguish it from a non-ra verifier failure.
- * Changes to this readahead error behavour also need to be reflected in
+ * Changes to this readahead error behaviour also need to be reflected in
* xfs_dquot_buf_readahead_verify().
*/
static void
@@ -76,25 +42,23 @@ xfs_inode_buf_verify(
bool readahead)
{
struct xfs_mount *mp = bp->b_mount;
- xfs_agnumber_t agno;
int i;
int ni;
/*
* Validate the magic number and version of every inode in the buffer
*/
- agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
- xfs_agino_t unlinked_ino;
+ struct xfs_dinode *dip;
+ xfs_agino_t unlinked_ino;
+ int di_ok;
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- xfs_verify_agino_or_null(mp, agno, unlinked_ino);
+ xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -106,7 +70,7 @@ xfs_inode_buf_verify(
#ifdef DEBUG
xfs_alert(mp,
"bad inode magic/vsn daddr %lld #%d (magic=%x)",
- (unsigned long long)bp->b_bn, i,
+ (unsigned long long)xfs_buf_daddr(bp), i,
be16_to_cpu(dip->di_magic));
#endif
xfs_buf_verifier_error(bp, -EFSCORRUPTED,
@@ -159,72 +123,96 @@ const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
/*
* This routine is called to map an inode to the buffer containing the on-disk
* version of the inode. It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
- * pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and dipp are
- * undefined.
+ * on-disk inode in the bpp parameter.
*/
int
xfs_imap_to_bp(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp,
- uint buf_flags,
- uint iget_flags)
+ struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
- int error;
-
- buf_flags |= XBF_UNMAPPED;
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- (int)imap->im_len, buf_flags, &bp,
+ return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ imap->im_len, XBF_UNMAPPED, bpp,
&xfs_inode_buf_ops);
- if (error) {
- if (error == -EAGAIN) {
- ASSERT(buf_flags & XBF_TRYLOCK);
- return error;
- }
- xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
- __func__, error);
- return error;
- }
+}
- *bpp = bp;
- *dipp = xfs_buf_offset(bp, imap->im_boffset);
- return 0;
+static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
+{
+ struct timespec64 tv;
+ uint32_t n;
+
+ tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n));
+ tv.tv_nsec = n;
+
+ return tv;
}
-void
+/* Convert an ondisk timestamp to an incore timestamp. */
+struct timespec64
+xfs_inode_from_disk_ts(
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+ struct xfs_legacy_timestamp *lts;
+
+ if (xfs_dinode_has_bigtime(dip))
+ return xfs_inode_decode_bigtime(be64_to_cpu(ts));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ tv.tv_sec = (int)be32_to_cpu(lts->t_sec);
+ tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec);
+
+ return tv;
+}
+
+int
xfs_inode_from_disk(
struct xfs_inode *ip,
struct xfs_dinode *from)
{
- struct xfs_icdinode *to = &ip->i_d;
struct inode *inode = VFS_I(ip);
+ int error;
+ xfs_failaddr_t fa;
+ ASSERT(ip->i_cowfp == NULL);
+
+ fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from,
+ sizeof(*from), fa);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * First get the permanent information that is needed to allocate an
+ * inode. If the inode is unused, mode is zero and we shouldn't mess
+ * with the uninitialized part of it.
+ */
+ if (!xfs_has_v3inodes(ip->i_mount))
+ ip->i_flushiter = be16_to_cpu(from->di_flushiter);
+ inode->i_generation = be32_to_cpu(from->di_gen);
+ inode->i_mode = be16_to_cpu(from->di_mode);
+ if (!inode->i_mode)
+ return 0;
/*
* Convert v1 inodes immediately to v2 inode format as this is the
* minimum inode version format we support in the rest of the code.
+ * They will also be unconditionally written back to disk as v2 inodes.
*/
- to->di_version = from->di_version;
- if (to->di_version == 1) {
+ if (unlikely(from->di_version == 1)) {
set_nlink(inode, be16_to_cpu(from->di_onlink));
- to->di_projid = 0;
- to->di_version = 2;
+ ip->i_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
- to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
+ ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
be16_to_cpu(from->di_projid_lo);
}
- to->di_format = from->di_format;
- to->di_uid = be32_to_cpu(from->di_uid);
- to->di_gid = be32_to_cpu(from->di_gid);
- to->di_flushiter = be16_to_cpu(from->di_flushiter);
+ i_uid_write(inode, be32_to_cpu(from->di_uid));
+ i_gid_write(inode, be32_to_cpu(from->di_gid));
/*
* Time is signed, so need to convert to signed 32 bit before
@@ -232,33 +220,80 @@ xfs_inode_from_disk(
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
- inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
- inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
- inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
- inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
- inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
- inode->i_generation = be32_to_cpu(from->di_gen);
- inode->i_mode = be16_to_cpu(from->di_mode);
+ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
+ inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
+
+ ip->i_disk_size = be64_to_cpu(from->di_size);
+ ip->i_nblocks = be64_to_cpu(from->di_nblocks);
+ ip->i_extsize = be32_to_cpu(from->di_extsize);
+ ip->i_forkoff = from->di_forkoff;
+ ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked);
+
+ if (from->di_dmevmask || from->di_dmstate)
+ xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
- to->di_size = be64_to_cpu(from->di_size);
- to->di_nblocks = be64_to_cpu(from->di_nblocks);
- to->di_extsize = be32_to_cpu(from->di_extsize);
- to->di_nextents = be32_to_cpu(from->di_nextents);
- to->di_anextents = be16_to_cpu(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
- to->di_dmstate = be16_to_cpu(from->di_dmstate);
- to->di_flags = be16_to_cpu(from->di_flags);
-
- if (to->di_version == 3) {
+ if (xfs_has_v3inodes(ip->i_mount)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
- to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec);
- to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec);
- to->di_flags2 = be64_to_cpu(from->di_flags2);
- to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
+ ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
+ ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ }
+
+ error = xfs_iformat_data_fork(ip, from);
+ if (error)
+ return error;
+ if (from->di_forkoff) {
+ error = xfs_iformat_attr_fork(ip, from);
+ if (error)
+ goto out_destroy_data_fork;
+ }
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+ return 0;
+
+out_destroy_data_fork:
+ xfs_idestroy_fork(&ip->i_df);
+ return error;
+}
+
+/* Convert an incore timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_inode_to_disk_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_timestamp *lts;
+ xfs_timestamp_t ts;
+
+ if (xfs_inode_has_bigtime(ip))
+ return cpu_to_be64(xfs_inode_encode_bigtime(tv));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lts->t_sec = cpu_to_be32(tv.tv_sec);
+ lts->t_nsec = cpu_to_be32(tv.tv_nsec);
+
+ return ts;
+}
+
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than was previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af));
}
}
@@ -268,107 +303,49 @@ xfs_inode_to_disk(
struct xfs_dinode *to,
xfs_lsn_t lsn)
{
- struct xfs_icdinode *from = &ip->i_d;
struct inode *inode = VFS_I(ip);
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
to->di_onlink = 0;
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff);
- to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
-
- memset(to->di_pad, 0, sizeof(to->di_pad));
- to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
- to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
- to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
- to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = cpu_to_be32(i_uid_read(inode));
+ to->di_gid = cpu_to_be32(i_gid_read(inode));
+ to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
+ to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
+
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
- to->di_size = cpu_to_be64(from->di_size);
- to->di_nblocks = cpu_to_be64(from->di_nblocks);
- to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
- to->di_dmstate = cpu_to_be16(from->di_dmstate);
- to->di_flags = cpu_to_be16(from->di_flags);
-
- if (from->di_version == 3) {
+ to->di_size = cpu_to_be64(ip->i_disk_size);
+ to->di_nblocks = cpu_to_be64(ip->i_nblocks);
+ to->di_extsize = cpu_to_be32(ip->i_extsize);
+ to->di_forkoff = ip->i_forkoff;
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
+ to->di_flags = cpu_to_be16(ip->i_diflags);
+
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec);
- to->di_flags2 = cpu_to_be64(from->di_flags2);
- to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+ to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
+ to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ to->di_version = 2;
+ to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
-}
-void
-xfs_log_dinode_to_disk(
- struct xfs_log_dinode *from,
- struct xfs_dinode *to)
-{
- to->di_magic = cpu_to_be16(from->di_magic);
- to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_onlink = 0;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_nlink = cpu_to_be32(from->di_nlink);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-
- to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
- to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
- to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-
- to->di_size = cpu_to_be64(from->di_size);
- to->di_nblocks = cpu_to_be64(from->di_nblocks);
- to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
- to->di_dmstate = cpu_to_be16(from->di_dmstate);
- to->di_flags = cpu_to_be16(from->di_flags);
- to->di_gen = cpu_to_be32(from->di_gen);
-
- if (from->di_version == 3) {
- to->di_changecount = cpu_to_be64(from->di_changecount);
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
- to->di_flags2 = cpu_to_be64(from->di_flags2);
- to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
- to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
- uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
- } else {
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
- }
+ xfs_inode_to_disk_iext_counters(ip, to);
}
static xfs_failaddr_t
@@ -377,20 +354,40 @@ xfs_dinode_verify_fork(
struct xfs_mount *mp,
int whichfork)
{
- uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
- switch (XFS_DFORK_FORMAT(dip, whichfork)) {
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
case XFS_DINODE_FMT_LOCAL:
/*
- * no local regular files yet
+ * No local regular files yet.
*/
- if (whichfork == XFS_DATA_FORK) {
- if (S_ISREG(be16_to_cpu(dip->di_mode)))
- return __this_address;
- if (be64_to_cpu(dip->di_size) >
- XFS_DFORK_SIZE(dip, mp, whichfork))
- return __this_address;
- }
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
if (di_nextents)
return __this_address;
break;
@@ -399,12 +396,11 @@ xfs_dinode_verify_fork(
return __this_address;
break;
case XFS_DINODE_FMT_BTREE:
- if (whichfork == XFS_ATTR_FORK) {
- if (di_nextents > MAXAEXTNUM)
- return __this_address;
- } else if (di_nextents > MAXEXTNUM) {
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
return __this_address;
- }
break;
default:
return __this_address;
@@ -417,7 +413,7 @@ xfs_dinode_verify_forkoff(
struct xfs_dinode *dip,
struct xfs_mount *mp)
{
- if (!XFS_DFORK_Q(dip))
+ if (!dip->di_forkoff)
return NULL;
switch (dip->di_format) {
@@ -428,7 +424,7 @@ xfs_dinode_verify_forkoff(
case XFS_DINODE_FMT_LOCAL: /* fall through ... */
case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
case XFS_DINODE_FMT_BTREE:
- if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3))
+ if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3))
return __this_address;
break;
default:
@@ -437,6 +433,24 @@ xfs_dinode_verify_forkoff(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -448,13 +462,16 @@ xfs_dinode_verify(
uint16_t flags;
uint64_t flags2;
uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return __this_address;
/* Verify v3 integrity information first */
if (dip->di_version >= 3) {
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_v3inodes(mp))
return __this_address;
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
@@ -478,10 +495,19 @@ xfs_dinode_verify(
if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
return __this_address;
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
/* Fork checks carried over from xfs_iformat_fork */
- if (mode &&
- be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
- be64_to_cpu(dip->di_nblocks))
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
@@ -520,7 +546,7 @@ xfs_dinode_verify(
return __this_address;
}
- if (XFS_DFORK_Q(dip)) {
+ if (dip->di_forkoff) {
fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK);
if (fa)
return fa;
@@ -538,7 +564,7 @@ xfs_dinode_verify(
default:
return __this_address;
}
- if (dip->di_anextents)
+ if (naextents)
return __this_address;
}
@@ -556,7 +582,7 @@ xfs_dinode_verify(
/* don't allow reflink/cowextsize if we don't have reflink */
if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
- !xfs_sb_version_hasreflink(&mp->m_sb))
+ !xfs_has_reflink(mp))
return __this_address;
/* only regular files get reflink */
@@ -567,16 +593,17 @@ xfs_dinode_verify(
if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
return __this_address;
- /* don't let reflink and dax mix */
- if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX))
- return __this_address;
-
/* COW extent size hint validation */
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
mode, flags, flags2);
if (fa)
return fa;
+ /* bigtime iflag can only happen on bigtime filesystems */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_has_bigtime(mp))
+ return __this_address;
+
return NULL;
}
@@ -590,136 +617,26 @@ xfs_dinode_calc_crc(
if (dip->di_version < 3)
return;
- ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+ ASSERT(xfs_has_crc(mp));
crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF);
dip->di_crc = xfs_end_cksum(crc);
}
/*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
- */
-int
-xfs_iread(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- uint iget_flags)
-{
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- xfs_failaddr_t fa;
- int error;
-
- /*
- * Fill in the location information in the in-core inode.
- */
- error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
- if (error)
- return error;
-
- /* shortcut IO on inode allocation if possible */
- if ((iget_flags & XFS_IGET_CREATE) &&
- xfs_sb_version_hascrc(&mp->m_sb) &&
- !(mp->m_flags & XFS_MOUNT_IKEEP)) {
- VFS_I(ip)->i_generation = prandom_u32();
- ip->i_d.di_version = 3;
- return 0;
- }
-
- /*
- * Get pointers to the on-disk inode and the buffer containing it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
- if (error)
- return error;
-
- /* even unallocated inodes are verified */
- fa = xfs_dinode_verify(mp, ip->i_ino, dip);
- if (fa) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip,
- sizeof(*dip), fa);
- error = -EFSCORRUPTED;
- goto out_brelse;
- }
-
- /*
- * If the on-disk inode is already linked to a directory
- * entry, copy all of the inode into the in-core inode.
- * xfs_iformat_fork() handles copying in the inode format
- * specific information.
- * Otherwise, just get the truly permanent information.
- */
- if (dip->di_mode) {
- xfs_inode_from_disk(ip, dip);
- error = xfs_iformat_fork(ip, dip);
- if (error) {
-#ifdef DEBUG
- xfs_alert(mp, "%s: xfs_iformat() returned error %d",
- __func__, error);
-#endif /* DEBUG */
- goto out_brelse;
- }
- } else {
- /*
- * Partial initialisation of the in-core inode. Just the bits
- * that xfs_ialloc won't overwrite or relies on being correct.
- */
- ip->i_d.di_version = dip->di_version;
- VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
- ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
-
- /*
- * Make sure to pull in the mode here as well in
- * case the inode is released without being used.
- * This ensures that xfs_inactive() will see that
- * the inode is already free and not try to mess
- * with the uninitialized part of it.
- */
- VFS_I(ip)->i_mode = 0;
- }
-
- ASSERT(ip->i_d.di_version >= 2);
- ip->i_delayed_blks = 0;
-
- /*
- * Mark the buffer containing the inode as something to keep
- * around for a while. This helps to keep recently accessed
- * meta-data in-core longer.
- */
- xfs_buf_set_ref(bp, XFS_INO_REF);
-
- /*
- * Use xfs_trans_brelse() to release the buffer containing the on-disk
- * inode, because it was acquired with xfs_trans_read_buf() in
- * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
- * brelse(). If we're within a transaction, then xfs_trans_brelse()
- * will only release the buffer if it is not dirty within the
- * transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be locking the
- * new in-core inode before putting it in the cache where other
- * processes can find it. Thus we don't have to worry about the inode
- * being changed just because we released the buffer.
- */
- out_brelse:
- xfs_trans_brelse(tp, bp);
- return error;
-}
-
-/*
* Validate di_extsize hint.
*
- * The rules are documented at xfs_ioctl_setattr_check_extsize().
- * These functions must be kept in sync with each other.
+ * 1. Extent size hint is only valid for directories and regular files.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files.
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. Hint cannot be larger than MAXTEXTLEN.
+ * 5. Can be changed on directories at any time.
+ * 6. Hint value of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * For realtime files, this is the rt extent size.
+ * 8. For non-realtime files, the extent size hint must be limited
+ * to half the AG size to avoid alignment extending the extent beyond the
+ * limits of the AG.
*/
xfs_failaddr_t
xfs_inode_validate_extsize(
@@ -739,8 +656,34 @@ xfs_inode_validate_extsize(
inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
extsize_bytes = XFS_FSB_TO_B(mp, extsize);
+ /*
+ * This comment describes a historic gap in this verifier function.
+ *
+ * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this
+ * function has never checked that the extent size hint is an integer
+ * multiple of the realtime extent size. Since we allow users to set
+ * this combination on non-rt filesystems /and/ to change the rt
+ * extent size when adding a rt device to a filesystem, the net effect
+ * is that users can configure a filesystem anticipating one rt
+ * geometry and change their minds later. Directories do not use the
+ * extent size hint, so this is harmless for them.
+ *
+ * If a directory with a misaligned extent size hint is allowed to
+ * propagate that hint into a new regular realtime file, the result
+ * is that the inode cluster buffer verifier will trigger a corruption
+ * shutdown the next time it is run, because the verifier has always
+ * enforced the alignment rule for regular files.
+ *
+ * Because we allow administrators to set a new rt extent size when
+ * adding a rt section, we cannot add a check to this verifier because
+ * that will result a new source of directory corruption errors when
+ * reading an existing filesystem. Instead, we rely on callers to
+ * decide when alignment checks are appropriate, and fix things up as
+ * needed.
+ */
+
if (rt_flag)
- blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+ blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
else
blocksize_bytes = mp->m_sb.sb_blocksize;
@@ -763,7 +706,7 @@ xfs_inode_validate_extsize(
if (extsize_bytes % blocksize_bytes)
return __this_address;
- if (extsize > MAXEXTLEN)
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
@@ -775,8 +718,15 @@ xfs_inode_validate_extsize(
/*
* Validate di_cowextsize hint.
*
- * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
- * These functions must be kept in sync with each other.
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ * The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ * for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. Hint value of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ * alignment extending the extent beyond the limits of the AG.
*/
xfs_failaddr_t
xfs_inode_validate_cowextsize(
@@ -794,7 +744,7 @@ xfs_inode_validate_cowextsize(
hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE);
cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize);
- if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb))
+ if (hint_flag && !xfs_has_reflink(mp))
return __this_address;
if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode)))
@@ -813,7 +763,7 @@ xfs_inode_validate_cowextsize(
if (cowextsize_bytes % mp->m_sb.sb_blocksize)
return __this_address;
- if (cowextsize > MAXEXTLEN)
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (cowextsize > mp->m_sb.sb_agblocks / 2)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index fd94b1078722..585ed5a110af 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -10,36 +10,6 @@ struct xfs_inode;
struct xfs_dinode;
/*
- * In memory representation of the XFS inode. This is held in the in-core struct
- * xfs_inode and represents the current on disk values but the structure is not
- * in on-disk format. That is, this structure is always translated to on-disk
- * format specific structures at the appropriate time.
- */
-struct xfs_icdinode {
- int8_t di_version; /* inode version */
- int8_t di_format; /* format of di_c data */
- uint16_t di_flushiter; /* incremented on flush */
- uint32_t di_uid; /* owner's user id */
- uint32_t di_gid; /* owner's group id */
- uint32_t di_projid; /* owner's project id */
- xfs_fsize_t di_size; /* number of bytes in file */
- xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
- xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
- uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
- int8_t di_aformat; /* format of attr fork's data */
- uint32_t di_dmevmask; /* DMIG event mask */
- uint16_t di_dmstate; /* DMIG state info */
- uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
-
- uint64_t di_flags2; /* more random flags */
- uint32_t di_cowextsize; /* basic cow extent size for file */
-
- struct timespec64 di_crtime; /* time created */
-};
-
-/*
* Inode location information. Stored in the inode and passed to
* xfs_imap_to_bp() to get a buffer and dinode for a given inode.
*/
@@ -49,25 +19,12 @@ struct xfs_imap {
unsigned short im_boffset; /* inode offset in block in bytes */
};
-int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
- struct xfs_imap *, struct xfs_dinode **,
- struct xfs_buf **, uint, uint);
-int xfs_iread(struct xfs_mount *, struct xfs_trans *,
- struct xfs_inode *, uint);
-void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_imap *imap, struct xfs_buf **bpp);
+void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip);
void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
xfs_lsn_t lsn);
-void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
-void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
- struct xfs_dinode *to);
-
-bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version);
-
-#if defined(DEBUG)
-void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
+int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
@@ -77,4 +34,21 @@ xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
uint32_t cowextsize, uint16_t mode, uint16_t flags,
uint64_t flags2);
+static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
+{
+ return xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC + tv.tv_nsec;
+}
+
+struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
+ const xfs_timestamp_t ts);
+
+static inline bool
+xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version)
+{
+ if (xfs_has_v3inodes(mp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
+
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index ad2b9c313fd2..6b21760184d9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -23,112 +23,10 @@
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
+#include "xfs_types.h"
+#include "xfs_errortag.h"
-kmem_zone_t *xfs_ifork_zone;
-
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-
-/*
- * Copy inode type and data and attr format specific information from the
- * on-disk inode to the in-core inode and fork structures. For fifos, devices,
- * and sockets this means set i_rdev to the proper value. For files,
- * directories, and symlinks this means to bring in the in-line data or extent
- * pointers as well as the attribute fork. For a fork in B-tree format, only
- * the root is immediately brought in-core. The rest will be read in later when
- * first referenced (see xfs_iread_extents()).
- */
-int
-xfs_iformat_fork(
- struct xfs_inode *ip,
- struct xfs_dinode *dip)
-{
- struct inode *inode = VFS_I(ip);
- struct xfs_attr_shortform *atp;
- int size;
- int error = 0;
- xfs_fsize_t di_size;
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFIFO:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFSOCK:
- ip->i_d.di_size = 0;
- inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
- break;
-
- case S_IFREG:
- case S_IFLNK:
- case S_IFDIR:
- switch (dip->di_format) {
- case XFS_DINODE_FMT_LOCAL:
- di_size = be64_to_cpu(dip->di_size);
- size = (int)di_size;
- error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
- break;
- default:
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
- dip, sizeof(*dip), __this_address);
- return -EFSCORRUPTED;
- }
- break;
-
- default:
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
- sizeof(*dip), __this_address);
- return -EFSCORRUPTED;
- }
- if (error)
- return error;
-
- if (xfs_is_reflink_inode(ip)) {
- ASSERT(ip->i_cowfp == NULL);
- xfs_ifork_init_cow(ip);
- }
-
- if (!XFS_DFORK_Q(dip))
- return 0;
-
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
-
- switch (dip->di_aformat) {
- case XFS_DINODE_FMT_LOCAL:
- atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
- size = be16_to_cpu(atp->hdr.totsize);
-
- error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
- break;
- default:
- xfs_inode_verifier_error(ip, error, __func__, dip,
- sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- break;
- }
- if (error) {
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- if (ip->i_cowfp)
- kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
- ip->i_cowfp = NULL;
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
- }
- return error;
-}
+struct kmem_cache *xfs_ifork_cache;
void
xfs_init_local_fork(
@@ -137,8 +35,8 @@ xfs_init_local_fork(
const void *data,
int64_t size)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int mem_size = size, real_size = 0;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int mem_size = size;
bool zero_terminate;
/*
@@ -152,8 +50,7 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -162,8 +59,6 @@ xfs_init_local_fork(
}
ifp->if_bytes = size;
- ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
- ifp->if_flags |= XFS_IFINLINE;
}
/*
@@ -171,10 +66,10 @@ xfs_init_local_fork(
*/
STATIC int
xfs_iformat_local(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork,
- int size)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork,
+ int size)
{
/*
* If the size is unreasonable, then something
@@ -183,7 +78,7 @@ xfs_iformat_local(
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+ "corrupt inode %llu (bad size %d for local fork, size = %zd).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
@@ -207,9 +102,9 @@ xfs_iformat_extents(
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
- int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
@@ -221,8 +116,8 @@ xfs_iformat_extents(
* we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
@@ -253,7 +148,6 @@ xfs_iformat_extents(
xfs_iext_next(ifp, &icur);
}
}
- ifp->if_flags |= XFS_IFEXTENTS;
return 0;
}
@@ -267,8 +161,8 @@ xfs_iformat_extents(
*/
STATIC int
xfs_iformat_btree(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
@@ -279,7 +173,7 @@ xfs_iformat_btree(
int size;
int level;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
size = XFS_BMAP_BROOT_SPACE(mp, dfp);
nrecs = be16_to_cpu(dfp->bb_numrecs);
@@ -292,14 +186,13 @@ xfs_iformat_btree(
* or the number of extents is greater than the number of
* blocks.
*/
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
- XFS_IFORK_MAXEXT(ip, whichfork) ||
+ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
nrecs == 0 ||
XFS_BMDR_SPACE_CALC(nrecs) >
XFS_DFORK_SIZE(dip, mp, whichfork) ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
- level == 0 || level > XFS_BTREE_MAXLEVELS) {
- xfs_warn(mp, "corrupt inode %Lu (btree).",
+ ifp->if_nextents > ip->i_nblocks) ||
+ level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
+ xfs_warn(mp, "corrupt inode %llu (btree).",
(unsigned long long) ip->i_ino);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_btree", dfp, size,
@@ -316,8 +209,6 @@ xfs_iformat_btree(
*/
xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
ifp->if_broot, size);
- ifp->if_flags &= ~XFS_IFEXTENTS;
- ifp->if_flags |= XFS_IFBROOT;
ifp->if_bytes = 0;
ifp->if_u1.if_root = NULL;
@@ -325,6 +216,124 @@ xfs_iformat_btree(
return 0;
}
+int
+xfs_iformat_data_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it.
+ */
+ ip->i_df.if_format = dip->di_format;
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ ip->i_disk_size = 0;
+ inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
+ return 0;
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (ip->i_df.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_iformat_local(ip, dip, XFS_DATA_FORK,
+ be64_to_cpu(dip->di_size));
+ if (!error)
+ error = xfs_ifork_verify_local_data(ip);
+ return error;
+ case XFS_DINODE_FMT_EXTENTS:
+ return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+ case XFS_DINODE_FMT_BTREE:
+ return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
+ return -EFSCORRUPTED;
+ }
+ break;
+ default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ return -EFSCORRUPTED;
+ }
+}
+
+static uint16_t
+xfs_dfork_attr_shortform_size(
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_shortform *atp =
+ (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+
+ return be16_to_cpu(atp->hdr.totsize);
+}
+
+void
+xfs_ifork_init_attr(
+ struct xfs_inode *ip,
+ enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents)
+{
+ ip->i_af.if_format = format;
+ ip->i_af.if_nextents = nextents;
+}
+
+void
+xfs_ifork_zap_attr(
+ struct xfs_inode *ip)
+{
+ xfs_idestroy_fork(&ip->i_af);
+ memset(&ip->i_af, 0, sizeof(struct xfs_ifork));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
+}
+
+int
+xfs_iformat_attr_fork(
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
+{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
+ int error = 0;
+
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it.
+ */
+ xfs_ifork_init_attr(ip, dip->di_aformat, naextents);
+
+ switch (ip->i_af.if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
+ xfs_dfork_attr_shortform_size(dip));
+ if (!error)
+ error = xfs_ifork_verify_local_attr(ip);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+ break;
+ default:
+ xfs_inode_verifier_error(ip, error, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ if (error)
+ xfs_ifork_zap_attr(ip);
+ return error;
+}
+
/*
* Reallocate the space for if_broot based on the number of records
* being added or deleted as indicated in rec_diff. Move the records
@@ -365,7 +374,7 @@ xfs_iroot_realloc(
return;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
if (rec_diff > 0) {
/*
* If there wasn't any memory allocated before, just
@@ -387,15 +396,15 @@ xfs_iroot_realloc(
cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
- ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- KM_NOFS);
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
return;
}
@@ -422,7 +431,6 @@ xfs_iroot_realloc(
XFS_BMBT_BLOCK_LEN(ip->i_mount));
} else {
new_broot = NULL;
- ifp->if_flags &= ~XFS_IFBROOT;
}
/*
@@ -450,7 +458,7 @@ xfs_iroot_realloc(
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
return;
}
@@ -476,11 +484,11 @@ xfs_idata_realloc(
int64_t byte_diff,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int64_t new_size = ifp->if_bytes + byte_diff;
ASSERT(new_size >= 0);
- ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));
+ ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
if (byte_diff == 0)
return;
@@ -492,50 +500,30 @@ xfs_idata_realloc(
return;
}
- /*
- * For inline data, the underlying buffer must be a multiple of 4 bytes
- * in size so that it can be logged and stay on word boundaries.
- * We enforce that here.
- */
- ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
- roundup(new_size, 4), KM_NOFS);
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
void
xfs_idestroy_fork(
- xfs_inode_t *ip,
- int whichfork)
+ struct xfs_ifork *ifp)
{
- struct xfs_ifork *ifp;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
if (ifp->if_broot != NULL) {
kmem_free(ifp->if_broot);
ifp->if_broot = NULL;
}
- /*
- * If the format is local, then we can't have an extents
- * array so just look for an inline data array. If we're
- * not local then we may or may not have an extents list,
- * so check and free it up if we do.
- */
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- if (ifp->if_u1.if_data != NULL) {
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
- }
- } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) {
- xfs_iext_destroy(ifp);
- }
-
- if (whichfork == XFS_ATTR_FORK) {
- kmem_cache_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- } else if (whichfork == XFS_COW_FORK) {
- kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
- ip->i_cowfp = NULL;
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ if (ifp->if_height)
+ xfs_iext_destroy(ifp);
+ break;
}
}
@@ -555,7 +543,7 @@ xfs_iextents_copy(
int whichfork)
{
int state = xfs_bmap_fork_to_state(whichfork);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec rec;
int64_t copied = 0;
@@ -590,9 +578,9 @@ xfs_iextents_copy(
*/
void
xfs_iflush_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- xfs_inode_log_item_t *iip,
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_item *iip,
int whichfork)
{
char *cp;
@@ -607,7 +595,7 @@ xfs_iflush_fork(
if (!iip)
return;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
/*
* This can happen if we gave up in iformat in an error path,
* for the attribute fork.
@@ -618,22 +606,20 @@ xfs_iflush_fork(
}
cp = XFS_DFORK_PTR(dip, whichfork);
mp = ip->i_mount;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(ifp->if_u1.if_data != NULL);
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+ ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
}
break;
case XFS_DINODE_FMT_EXTENTS:
- ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
- !(iip->ili_fields & extflag[whichfork]));
if ((iip->ili_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+ ASSERT(ifp->if_nextents > 0);
(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
whichfork);
}
@@ -644,7 +630,7 @@ xfs_iflush_fork(
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -674,7 +660,7 @@ xfs_iext_state_to_fork(
if (state & BMAP_COWFORK)
return ip->i_cowfp;
else if (state & BMAP_ATTRFORK)
- return ip->i_afp;
+ return &ip->i_af;
return &ip->i_df;
}
@@ -688,51 +674,106 @@ xfs_ifork_init_cow(
if (ip->i_cowfp)
return;
- ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
- KM_NOFS);
- ip->i_cowfp->if_flags = XFS_IFEXTENTS;
- ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
- ip->i_cnextents = 0;
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
-/* Default fork content verifiers. */
-struct xfs_ifork_ops xfs_default_ifork_ops = {
- .verify_attr = xfs_attr_shortform_verify,
- .verify_dir = xfs_dir2_sf_verify,
- .verify_symlink = xfs_symlink_shortform_verify,
-};
-
/* Verify the inline contents of the data fork of an inode. */
-xfs_failaddr_t
-xfs_ifork_verify_data(
- struct xfs_inode *ip,
- struct xfs_ifork_ops *ops)
+int
+xfs_ifork_verify_local_data(
+ struct xfs_inode *ip)
{
- /* Non-local data fork, we're done. */
- if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
- return NULL;
+ xfs_failaddr_t fa = NULL;
- /* Check the inline data fork if there is one. */
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFDIR:
- return ops->verify_dir(ip);
+ fa = xfs_dir2_sf_verify(ip);
+ break;
case S_IFLNK:
- return ops->verify_symlink(ip);
+ fa = xfs_symlink_shortform_verify(ip);
+ break;
default:
- return NULL;
+ break;
+ }
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+ ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ return -EFSCORRUPTED;
}
+
+ return 0;
}
/* Verify the inline contents of the attr fork of an inode. */
-xfs_failaddr_t
-xfs_ifork_verify_attr(
+int
+xfs_ifork_verify_local_attr(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+ xfs_failaddr_t fa;
+
+ if (!xfs_inode_has_attr_fork(ip))
+ fa = __this_address;
+ else
+ fa = xfs_attr_shortform_verify(ip);
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+int
+xfs_iext_count_may_overflow(
+ struct xfs_inode *ip,
+ int whichfork,
+ int nr_to_add)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ uint64_t max_exts;
+ uint64_t nr_exts;
+
+ if (whichfork == XFS_COW_FORK)
+ return 0;
+
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
+
+ if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ max_exts = 10;
+
+ nr_exts = ifp->if_nextents + nr_to_add;
+ if (nr_exts < ifp->if_nextents || nr_exts > max_exts)
+ return -EFBIG;
+
+ return 0;
+}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
- struct xfs_ifork_ops *ops)
+ uint nr_to_add)
{
- /* There has to be an attr fork allocated if aformat is local. */
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return NULL;
- if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK))
- return __this_address;
- return ops->verify_attr(ip);
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 500333d0101e..d3943d6ad0b9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -21,82 +21,154 @@ struct xfs_ifork {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
- unsigned char if_flags; /* per-fork flags */
+ int8_t if_format; /* format of this fork */
};
/*
- * Per-fork incore inode flags.
+ * Worst-case increase in the fork extent count when we're adding a single
+ * extent to a fork and there's no possibility of splitting an existing mapping.
*/
-#define XFS_IFINLINE 0x01 /* Inline data is read in */
-#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
-#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
+#define XFS_IEXT_ADD_NOSPLIT_CNT (1)
/*
- * Fork handling.
+ * Punching out an extent from the middle of an existing extent can cause the
+ * extent count to increase by 1.
+ * i.e. | Old extent | Hole | Old extent |
+ */
+#define XFS_IEXT_PUNCH_HOLE_CNT (1)
+
+/*
+ * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
+ * be added. One extra extent for dabtree in case a local attr is
+ * large enough to cause a double split. It can also cause extent
+ * count to increase proportional to the size of a remote xattr's
+ * value.
+ */
+#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \
+ (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks))
+
+/*
+ * A write to a sub-interval of an existing unwritten extent causes the original
+ * extent to be split into 3 extents
+ * i.e. | Unwritten | Real | Unwritten |
+ * Hence extent count can increase by 2.
+ */
+#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2)
+
+
+/*
+ * Moving an extent to data fork can cause a sub-interval of an existing extent
+ * to be unmapped. This will increase extent count by 1. Mapping in the new
+ * extent can increase the extent count by 1 again i.e.
+ * | Old extent | New extent | Old extent |
+ * Hence number of extents increases by 2.
*/
+#define XFS_IEXT_REFLINK_END_COW_CNT (2)
-#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- &(ip)->i_df : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_afp : \
- (ip)->i_cowfp))
-#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
-#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
- XFS_IFORK_BOFF(ip) : \
- 0)
-#define XFS_IFORK_SIZE(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_IFORK_DSIZE(ip) : \
- ((w) == XFS_ATTR_FORK ? \
- XFS_IFORK_ASIZE(ip) : \
- 0))
-#define XFS_IFORK_FORMAT(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_format : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_d.di_aformat : \
- (ip)->i_cformat))
-#define XFS_IFORK_FMT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_format = (n)) : \
- ((w) == XFS_ATTR_FORK ? \
- ((ip)->i_d.di_aformat = (n)) : \
- ((ip)->i_cformat = (n))))
-#define XFS_IFORK_NEXTENTS(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_nextents : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_d.di_anextents : \
- (ip)->i_cnextents))
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_nextents = (n)) : \
- ((w) == XFS_ATTR_FORK ? \
- ((ip)->i_d.di_anextents = (n)) : \
- ((ip)->i_cnextents = (n))))
+/*
+ * Removing an initial range of source/donor file's extent and adding a new
+ * extent (from donor/source file) in its place will cause extent count to
+ * increase by 1.
+ */
+#define XFS_IEXT_SWAP_RMAP_CNT (1)
+
+/*
+ * Fork handling.
+ */
#define XFS_IFORK_MAXEXT(ip, w) \
- (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+ (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
+{
+ return ifp->if_format == XFS_DINODE_FMT_EXTENTS ||
+ ifp->if_format == XFS_DINODE_FMT_BTREE;
+}
+
+static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp)
+{
+ if (!ifp)
+ return 0;
+ return ifp->if_nextents;
+}
-#define xfs_ifork_has_extents(ip, w) \
- (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \
- XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE)
+static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
+{
+ if (!ifp)
+ return XFS_DINODE_FMT_EXTENTS;
+ return ifp->if_format;
+}
+
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
+void xfs_ifork_zap_attr(struct xfs_inode *ip);
+void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
-int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
+int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
-void xfs_idestroy_fork(struct xfs_inode *, int);
+void xfs_idestroy_fork(struct xfs_ifork *ifp);
void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
@@ -176,22 +248,21 @@ static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
xfs_iext_get_extent((ifp), (ext), (got)); \
xfs_iext_next((ifp), (ext)))
-extern struct kmem_zone *xfs_ifork_zone;
+extern struct kmem_cache *xfs_ifork_cache;
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
-typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *);
-
-struct xfs_ifork_ops {
- xfs_ifork_verifier_t verify_symlink;
- xfs_ifork_verifier_t verify_dir;
- xfs_ifork_verifier_t verify_attr;
-};
-extern struct xfs_ifork_ops xfs_default_ifork_ops;
+int xfs_ifork_verify_local_data(struct xfs_inode *ip);
+int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
+int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
+ int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
-xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip,
- struct xfs_ifork_ops *ops);
-xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip,
- struct xfs_ifork_ops *ops);
+/* returns true if the fork has extents but they are not read in yet. */
+static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
+{
+ return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+}
#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 9bac0d2e56dc..f13e0809dc63 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t;
#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
- (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
#define XLOG_HEADER_SIZE 512
@@ -44,10 +41,10 @@ typedef uint32_t xlog_tid_t;
#define XFS_MIN_LOG_FACTOR 3
#define XLOG_REC_SHIFT(log) \
- BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \
XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
#define XLOG_TOTAL_REC_SHIFT(log) \
- BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \
XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
/* get lsn fields */
@@ -72,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr)
/* Log Clients */
#define XFS_TRANSACTION 0x69
-#define XFS_VOLUME 0x2
#define XFS_LOG 0xaa
#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
@@ -117,7 +113,12 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_MAX 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
/*
* Flags to log operation header
@@ -240,6 +241,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_CUD 0x1243
#define XFS_LI_BUI 0x1244 /* bmbt update intent */
#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -255,7 +258,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_CUI, "XFS_LI_CUI" }, \
{ XFS_LI_CUD, "XFS_LI_CUD" }, \
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
- { XFS_LI_BUD, "XFS_LI_BUD" }
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
/*
* Inode Log Item Format definitions.
@@ -368,10 +373,13 @@ static inline int xfs_ilog_fdata(int w)
* directly mirrors the xfs_dinode structure as it must contain all the same
* information.
*/
-typedef struct xfs_ictimestamp {
+typedef uint64_t xfs_log_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_log_legacy_timestamp {
int32_t t_sec; /* timestamp seconds */
int32_t t_nsec; /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
+};
/*
* Define the format of the inode core that is logged. This structure must be
@@ -388,16 +396,41 @@ struct xfs_log_dinode {
uint32_t di_nlink; /* number of links to file */
uint16_t di_projid_lo; /* lower part of owner's project id */
uint16_t di_projid_hi; /* higher part of owner's project id */
- uint8_t di_pad[6]; /* unused, zeroed space */
- uint16_t di_flushiter; /* incremented on flush */
- xfs_ictimestamp_t di_atime; /* time last accessed */
- xfs_ictimestamp_t di_mtime; /* time last modified */
- xfs_ictimestamp_t di_ctime; /* time created/inode modified */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
+ xfs_log_timestamp_t di_atime; /* time last accessed */
+ xfs_log_timestamp_t di_mtime; /* time last modified */
+ xfs_log_timestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
@@ -411,25 +444,32 @@ struct xfs_log_dinode {
/* start of the extended dinode, writable fields */
uint32_t di_crc; /* CRC of the inode */
uint64_t di_changecount; /* number of attribute changes */
- xfs_lsn_t di_lsn; /* flush sequence */
+
+ /*
+ * The LSN we write to this field during formatting is not a reflection
+ * of the current on-disk LSN. It should never be used for recovery
+ * sequencing, nor should it be recovered into the on-disk inode at all.
+ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+ * for details.
+ */
+ xfs_lsn_t di_lsn;
+
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
- xfs_ictimestamp_t di_crtime; /* time created */
+ xfs_log_timestamp_t di_crtime; /* time created */
xfs_ino_t di_ino; /* inode number */
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
};
-static inline uint xfs_log_dinode_size(int version)
-{
- if (version == 3)
- return sizeof(struct xfs_log_dinode);
- return offsetof(struct xfs_log_dinode, di_next_unlinked);
-}
+#define xfs_log_dinode_size(mp) \
+ (xfs_has_v3inodes((mp)) ? \
+ sizeof(struct xfs_log_dinode) : \
+ offsetof(struct xfs_log_dinode, di_next_unlinked))
/*
* Buffer Log Format definitions
@@ -573,25 +613,49 @@ typedef struct xfs_efi_log_format {
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_t;
+static inline size_t
+xfs_efi_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
typedef struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_32_t efi_extents[]; /* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;
+static inline size_t
+xfs_efi_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
typedef struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[1]; /* array of extents to free */
+ xfs_extent_64_t efi_extents[]; /* array of extents to free */
} xfs_efi_log_format_64_t;
+static inline size_t
+xfs_efi_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efi_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
/*
* This is the structure used to lay out an efd log item in the
* log. The efd_extents array is a variable size array whose
@@ -602,25 +666,49 @@ typedef struct xfs_efd_log_format {
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_t;
+static inline size_t
+xfs_efd_log_format_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format) +
+ nr * sizeof(struct xfs_extent);
+}
+
typedef struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_32_t efd_extents[]; /* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;
+static inline size_t
+xfs_efd_log_format32_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_32) +
+ nr * sizeof(struct xfs_extent_32);
+}
+
typedef struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[1]; /* array of extents freed */
+ xfs_extent_64_t efd_extents[]; /* array of extents freed */
} xfs_efd_log_format_64_t;
+static inline size_t
+xfs_efd_log_format64_sizeof(
+ unsigned int nr)
+{
+ return sizeof(struct xfs_efd_log_format_64) +
+ nr * sizeof(struct xfs_extent_64);
+}
+
/*
* RUI/RUD (reverse mapping) log format definitions
*/
@@ -862,4 +950,44 @@ struct xfs_icreate_log {
__be32 icl_gen; /* inode generation number to use */
};
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, lower byte is type code
+ */
+#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should
+ * never have any other bits set.
+ */
+#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_INCOMPLETE)
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_filter;/* attr filter flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
+
#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 3bf671637a91..2420865f3007 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -7,6 +7,75 @@
#define __XFS_LOG_RECOVER_H__
/*
+ * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to
+ * define how recovery should work for that type of log item.
+ */
+struct xlog_recover_item;
+
+/* Sorting hat for log items as they're read in. */
+enum xlog_recover_reorder {
+ XLOG_REORDER_BUFFER_LIST,
+ XLOG_REORDER_ITEM_LIST,
+ XLOG_REORDER_INODE_BUFFER_LIST,
+ XLOG_REORDER_CANCEL_LIST,
+};
+
+struct xlog_recover_item_ops {
+ uint16_t item_type; /* XFS_LI_* type code. */
+
+ /*
+ * Help sort recovered log items into the order required to replay them
+ * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do
+ * not have to supply a function here. See the comment preceding
+ * xlog_recover_reorder_trans for more details about what the return
+ * values mean.
+ */
+ enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item);
+
+ /* Start readahead for pass2, if provided. */
+ void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item);
+
+ /* Do whatever work we need to do for pass1, if provided. */
+ int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item);
+
+ /*
+ * This function should do whatever work is needed for pass2 of log
+ * recovery, if provided.
+ *
+ * If the recovered item is an intent item, this function should parse
+ * the recovered item to construct an in-core log intent item and
+ * insert it into the AIL. The in-core log intent item should have 1
+ * refcount so that the item is freed either (a) when we commit the
+ * recovered log item for the intent-done item; (b) replay the work and
+ * log a new intent-done item; or (c) recovery fails and we have to
+ * abort.
+ *
+ * If the recovered item is an intent-done item, this function should
+ * parse the recovered item to find the id of the corresponding intent
+ * log item. Next, it should find the in-core log intent item in the
+ * AIL and release it.
+ */
+ int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list,
+ struct xlog_recover_item *item, xfs_lsn_t lsn);
+};
+
+extern const struct xlog_recover_item_ops xlog_icreate_item_ops;
+extern const struct xlog_recover_item_ops xlog_buf_item_ops;
+extern const struct xlog_recover_item_ops xlog_inode_item_ops;
+extern const struct xlog_recover_item_ops xlog_dquot_item_ops;
+extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops;
+extern const struct xlog_recover_item_ops xlog_bui_item_ops;
+extern const struct xlog_recover_item_ops xlog_bud_item_ops;
+extern const struct xlog_recover_item_ops xlog_efi_item_ops;
+extern const struct xlog_recover_item_ops xlog_efd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rud_item_ops;
+extern const struct xlog_recover_item_ops xlog_cui_item_ops;
+extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+
+/*
* Macros, structures, prototypes for internal log manager use.
*/
@@ -22,13 +91,13 @@
/*
* item headers are in ri_buf[0]. Additional buffers follow.
*/
-typedef struct xlog_recover_item {
+struct xlog_recover_item {
struct list_head ri_list;
- int ri_type;
int ri_cnt; /* count of regions found */
int ri_total; /* total regions */
- xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
-} xlog_recover_item_t;
+ struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ const struct xlog_recover_item_ops *ri_ops;
+};
struct xlog_recover {
struct hlist_node r_list;
@@ -41,14 +110,25 @@ struct xlog_recover {
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
-/*
- * This is the number of entries in the l_buf_cancel_table used during
- * recovery.
- */
-#define XLOG_BC_TABLE_SIZE 64
-
#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
+void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
+ const struct xfs_buf_ops *ops);
+bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
+
+int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
+ struct xfs_inode **ipp);
+void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
+ uint64_t intent_id);
+int xlog_alloc_buf_cancel_table(struct xlog *log);
+void xlog_free_buf_cancel_table(struct xlog *log);
+
+#ifdef DEBUG
+void xlog_check_buf_cancel_table(struct xlog *log);
+#else
+#define xlog_check_buf_cancel_table(log) do { } while (0)
+#endif
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 7f55eb3f3653..9975b93a7412 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -14,6 +14,7 @@
#include "xfs_trans_space.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
/*
* Calculate the maximum length in bytes that would be required for a local
@@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res(
}
/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts were not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
* Iterate over the log space reservation table to figure out and return
* the maximum one in terms of the pre-calculated values which were done
* at mount time.
@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
struct xfs_mount *mp,
struct xfs_trans_res *max_resp)
{
+ struct xfs_trans_resv resv = {};
struct xfs_trans_res *resp;
struct xfs_trans_res *end_resp;
+ unsigned int i;
int log_space = 0;
int attr_space;
attr_space = xfs_log_calc_max_attrsetm_res(mp);
- resp = (struct xfs_trans_res *)M_RES(mp);
- end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
- for (; resp < end_resp; resp++) {
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
int tmp = resp->tr_logcount > 1 ?
resp->tr_logres * resp->tr_logcount :
resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
if (log_space < tmp) {
log_space = tmp;
*max_resp = *resp; /* struct copy */
@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
}
if (attr_space > log_space) {
- *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
+ *max_resp = resv.tr_attrsetm; /* struct copy */
max_resp->tr_logres = attr_space;
}
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
}
/*
@@ -92,7 +159,7 @@ xfs_log_calc_minimum_size(
if (tres.tr_logcount > 1)
max_logres *= tres.tr_logcount;
- if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
lsunit = BTOBB(mp->m_sb.sb_logsunit);
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index b2113b17e53c..cb035da3f990 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -16,25 +16,24 @@
* and quota-limits. This is a waste in the common case, but hey ...
*/
typedef uint64_t xfs_qcnt_t;
-typedef uint16_t xfs_qwarncnt_t;
+
+typedef uint8_t xfs_dqtype_t;
+
+#define XFS_DQTYPE_STRINGS \
+ { XFS_DQTYPE_USER, "USER" }, \
+ { XFS_DQTYPE_PROJ, "PROJ" }, \
+ { XFS_DQTYPE_GROUP, "GROUP" }, \
+ { XFS_DQTYPE_BIGTIME, "BIGTIME" }
/*
* flags for q_flags field in the dquot.
*/
-#define XFS_DQ_USER 0x0001 /* a user quota */
-#define XFS_DQ_PROJ 0x0002 /* project quota */
-#define XFS_DQ_GROUP 0x0004 /* a group quota */
-#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
-
-#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
-#define XFS_DQ_FLAGS \
- { XFS_DQ_USER, "USER" }, \
- { XFS_DQ_PROJ, "PROJ" }, \
- { XFS_DQ_GROUP, "GROUP" }, \
- { XFS_DQ_DIRTY, "DIRTY" }, \
- { XFS_DQ_FREEING, "FREEING" }
+#define XFS_DQFLAG_STRINGS \
+ { XFS_DQFLAG_DIRTY, "DIRTY" }, \
+ { XFS_DQFLAG_FREEING, "FREEING" }
/*
* We have the possibility of all three quota types being active at once, and
@@ -60,65 +59,58 @@ typedef uint16_t xfs_qwarncnt_t;
#define XFS_DQUOT_LOGRES(mp) \
((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
-#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
/*
- * Incore only flags for quotaoff - these bits get cleared when quota(s)
- * are in the process of getting turned off. These flags are in m_qflags but
- * never in sb_qflags.
- */
-#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
-#define XFS_ALL_QUOTA_ACTIVE \
- (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
-
-/*
- * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
- * quota will be not be switched off as long as that inode lock is held.
- */
-#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
- XFS_GQUOTA_ACTIVE | \
- XFS_PQUOTA_ACTIVE))
-#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
-#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
-#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
-
-/*
* Flags to tell various functions what to do. Not all of these are meaningful
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
* modified.
*/
-#define XFS_QMOPT_RES_REGBLKS 0x0010000
-#define XFS_QMOPT_RES_RTBLKS 0x0020000
-#define XFS_QMOPT_BCOUNT 0x0040000
-#define XFS_QMOPT_ICOUNT 0x0080000
-#define XFS_QMOPT_RTBCOUNT 0x0100000
-#define XFS_QMOPT_DELBCOUNT 0x0200000
-#define XFS_QMOPT_DELRTBCOUNT 0x0400000
-#define XFS_QMOPT_RES_INOS 0x0800000
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
/*
* flags for dqalloc.
*/
-#define XFS_QMOPT_INHERIT 0x1000000
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
/*
* flags to xfs_trans_mod_dquot.
@@ -137,12 +129,18 @@ typedef uint16_t xfs_qwarncnt_t;
(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
- struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
+ struct xfs_disk_dquot *ddq, xfs_dqid_t id);
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
- struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
+ struct xfs_dqblk *dqb, xfs_dqid_t id);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
- xfs_dqid_t id, uint type);
+ xfs_dqid_t id, xfs_dqtype_t type);
+
+struct xfs_dquot;
+time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
+ __be32 dtimer);
+__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6e1665f2cb67..3f34bafe18dd 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -22,6 +22,9 @@
#include "xfs_bit.h"
#include "xfs_refcount.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_refcount_intent_cache;
/* Allowable refcount adjustment amounts. */
enum xfs_refc_adjust_op {
@@ -43,13 +46,16 @@ STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
int
xfs_refcount_lookup_le(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
}
@@ -60,13 +66,16 @@ xfs_refcount_lookup_le(
int
xfs_refcount_lookup_ge(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
@@ -77,23 +86,36 @@ xfs_refcount_lookup_ge(
int
xfs_refcount_lookup_eq(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
+ cur->bc_rec.rc.rc_domain = domain;
return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
}
/* Convert on-disk record to in-core format. */
void
xfs_refcount_btrec_to_irec(
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec)
{
- irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+ uint32_t start;
+
+ start = be32_to_cpu(rec->refc.rc_startblock);
+ if (start & XFS_REFC_COWFLAG) {
+ start &= ~XFS_REFC_COWFLAG;
+ irec->rc_domain = XFS_REFC_DOMAIN_COW;
+ } else {
+ irec->rc_domain = XFS_REFC_DOMAIN_SHARED;
+ }
+
+ irec->rc_startblock = start;
irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
}
@@ -108,48 +130,35 @@ xfs_refcount_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
- xfs_agblock_t realstart;
error = xfs_btree_get_rec(cur, &rec, stat);
if (error || !*stat)
return error;
xfs_refcount_btrec_to_irec(rec, irec);
-
- agno = cur->bc_private.a.agno;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
- /* handle special COW-staging state */
- realstart = irec->rc_startblock;
- if (realstart & XFS_REFC_COW_START) {
- if (irec->rc_refcount != 1)
- goto out_bad_rec;
- realstart &= ~XFS_REFC_COW_START;
- } else if (irec->rc_refcount < 2) {
+ if (!xfs_refcount_check_domain(irec))
goto out_bad_rec;
- }
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, realstart))
- goto out_bad_rec;
- if (realstart > realstart + irec->rc_blockcount)
- goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1))
+ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
goto out_bad_rec;
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
goto out_bad_rec;
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
return 0;
out_bad_rec:
xfs_warn(mp,
- "Refcount BTree record corruption in AG %d detected!", agno);
+ "Refcount BTree record corruption in AG %d detected!",
+ pag->pag_agno);
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -167,16 +176,21 @@ xfs_refcount_update(
struct xfs_refcount_irec *irec)
{
union xfs_btree_rec rec;
+ uint32_t start;
int error;
- trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
- rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec.refc.rc_startblock = cpu_to_be32(start);
rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_refcount_update_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -193,10 +207,13 @@ xfs_refcount_insert(
{
int error;
- trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+ cur->bc_rec.rc.rc_domain = irec->rc_domain;
+
error = xfs_btree_insert(cur, i);
if (error)
goto out_error;
@@ -208,7 +225,7 @@ xfs_refcount_insert(
out_error:
if (error)
trace_xfs_refcount_insert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -234,7 +251,7 @@ xfs_refcount_delete(
error = -EFSCORRUPTED;
goto out_error;
}
- trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
error = -EFSCORRUPTED;
@@ -242,11 +259,12 @@ xfs_refcount_delete(
}
if (error)
goto out_error;
- error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, irec.rc_domain, irec.rc_startblock,
+ &found_rec);
out_error:
if (error)
trace_xfs_refcount_delete_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -341,6 +359,7 @@ xfs_refc_next(
STATIC int
xfs_refcount_split_extent(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
bool *shape_changed)
{
@@ -349,7 +368,7 @@ xfs_refcount_split_extent(
int error;
*shape_changed = false;
- error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+ error = xfs_refcount_lookup_le(cur, domain, agbno, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -362,11 +381,13 @@ xfs_refcount_split_extent(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (rcext.rc_domain != domain)
+ return 0;
if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
return 0;
*shape_changed = true;
- trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
&rcext, agbno);
/* Establish the right extent. */
@@ -391,7 +412,7 @@ xfs_refcount_split_extent(
out_error:
trace_xfs_refcount_split_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -411,7 +432,10 @@ xfs_refcount_merge_center_extents(
int found_rec;
trace_xfs_refcount_merge_center_extents(cur->bc_mp,
- cur->bc_private.a.agno, left, center, right);
+ cur->bc_ag.pag->pag_agno, left, center, right);
+
+ ASSERT(left->rc_domain == center->rc_domain);
+ ASSERT(right->rc_domain == center->rc_domain);
/*
* Make sure the center and right extents are not in the btree.
@@ -421,8 +445,8 @@ xfs_refcount_merge_center_extents(
* call removes the center and the second one removes the right
* extent.
*/
- error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_ge(cur, center->rc_domain,
+ center->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -449,8 +473,8 @@ xfs_refcount_merge_center_extents(
}
/* Enlarge the left extent. */
- error = xfs_refcount_lookup_le(cur, left->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -468,7 +492,7 @@ xfs_refcount_merge_center_extents(
out_error:
trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -487,12 +511,14 @@ xfs_refcount_merge_left_extent(
int found_rec;
trace_xfs_refcount_merge_left_extent(cur->bc_mp,
- cur->bc_private.a.agno, left, cleft);
+ cur->bc_ag.pag->pag_agno, left, cleft);
+
+ ASSERT(left->rc_domain == cleft->rc_domain);
/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
if (cleft->rc_refcount > 1) {
- error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, cleft->rc_domain,
+ cleft->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -510,8 +536,8 @@ xfs_refcount_merge_left_extent(
}
/* Enlarge the left extent. */
- error = xfs_refcount_lookup_le(cur, left->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, left->rc_domain,
+ left->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -530,7 +556,7 @@ xfs_refcount_merge_left_extent(
out_error:
trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -548,15 +574,17 @@ xfs_refcount_merge_right_extent(
int found_rec;
trace_xfs_refcount_merge_right_extent(cur->bc_mp,
- cur->bc_private.a.agno, cright, right);
+ cur->bc_ag.pag->pag_agno, cright, right);
+
+ ASSERT(right->rc_domain == cright->rc_domain);
/*
* If the extent ending at agbno+aglen (cright) wasn't synthesized,
* remove it.
*/
if (cright->rc_refcount > 1) {
- error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, cright->rc_domain,
+ cright->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -574,8 +602,8 @@ xfs_refcount_merge_right_extent(
}
/* Enlarge the right extent. */
- error = xfs_refcount_lookup_le(cur, right->rc_startblock,
- &found_rec);
+ error = xfs_refcount_lookup_le(cur, right->rc_domain,
+ right->rc_startblock, &found_rec);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
@@ -594,12 +622,10 @@ xfs_refcount_merge_right_extent(
out_error:
trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
-#define XFS_FIND_RCEXT_SHARED 1
-#define XFS_FIND_RCEXT_COW 2
/*
* Find the left extent and the one after it (cleft). This function assumes
* that we've already split any extent crossing agbno.
@@ -609,16 +635,16 @@ xfs_refcount_find_left_extents(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *left,
struct xfs_refcount_irec *cleft,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- int flags)
+ xfs_extlen_t aglen)
{
struct xfs_refcount_irec tmp;
int error;
int found_rec;
left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
- error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+ error = xfs_refcount_lookup_le(cur, domain, agbno - 1, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -632,11 +658,9 @@ xfs_refcount_find_left_extents(
goto out_error;
}
- if (xfs_refc_next(&tmp) != agbno)
- return 0;
- if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+ if (tmp.rc_domain != domain)
return 0;
- if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ if (xfs_refc_next(&tmp) != agbno)
return 0;
/* We have a left extent; retrieve (or invent) the next right one */
*left = tmp;
@@ -653,6 +677,9 @@ xfs_refcount_find_left_extents(
goto out_error;
}
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
/* if tmp starts at the end of our range, just use that */
if (tmp.rc_startblock == agbno)
*cleft = tmp;
@@ -669,8 +696,10 @@ xfs_refcount_find_left_extents(
cleft->rc_blockcount = min(aglen,
tmp.rc_startblock - agbno);
cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
}
} else {
+not_found:
/*
* No extents, so pretend that there's one covering the whole
* range.
@@ -678,14 +707,15 @@ xfs_refcount_find_left_extents(
cleft->rc_startblock = agbno;
cleft->rc_blockcount = aglen;
cleft->rc_refcount = 1;
+ cleft->rc_domain = domain;
}
- trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
left, cleft, agbno);
return error;
out_error:
trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -698,16 +728,16 @@ xfs_refcount_find_right_extents(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *right,
struct xfs_refcount_irec *cright,
+ enum xfs_refc_domain domain,
xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- int flags)
+ xfs_extlen_t aglen)
{
struct xfs_refcount_irec tmp;
int error;
int found_rec;
right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
- error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, domain, agbno + aglen, &found_rec);
if (error)
goto out_error;
if (!found_rec)
@@ -721,11 +751,9 @@ xfs_refcount_find_right_extents(
goto out_error;
}
- if (tmp.rc_startblock != agbno + aglen)
+ if (tmp.rc_domain != domain)
return 0;
- if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
- return 0;
- if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+ if (tmp.rc_startblock != agbno + aglen)
return 0;
/* We have a right extent; retrieve (or invent) the next left one */
*right = tmp;
@@ -742,6 +770,9 @@ xfs_refcount_find_right_extents(
goto out_error;
}
+ if (tmp.rc_domain != domain)
+ goto not_found;
+
/* if tmp ends at the end of our range, just use that */
if (xfs_refc_next(&tmp) == agbno + aglen)
*cright = tmp;
@@ -758,8 +789,10 @@ xfs_refcount_find_right_extents(
cright->rc_blockcount = right->rc_startblock -
cright->rc_startblock;
cright->rc_refcount = 1;
+ cright->rc_domain = domain;
}
} else {
+not_found:
/*
* No extents, so pretend that there's one covering the whole
* range.
@@ -767,14 +800,15 @@ xfs_refcount_find_right_extents(
cright->rc_startblock = agbno;
cright->rc_blockcount = aglen;
cright->rc_refcount = 1;
+ cright->rc_domain = domain;
}
- trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
cright, right, agbno + aglen);
return error;
out_error:
trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -792,10 +826,10 @@ xfs_refc_valid(
STATIC int
xfs_refcount_merge_extents(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
enum xfs_refc_adjust_op adjust,
- int flags,
bool *shape_changed)
{
struct xfs_refcount_irec left = {0}, cleft = {0};
@@ -810,12 +844,12 @@ xfs_refcount_merge_extents(
* just below (agbno + aglen) [cright], and just above (agbno + aglen)
* [right].
*/
- error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
- *aglen, flags);
+ error = xfs_refcount_find_left_extents(cur, &left, &cleft, domain,
+ *agbno, *aglen);
if (error)
return error;
- error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
- *aglen, flags);
+ error = xfs_refcount_find_right_extents(cur, &right, &cright, domain,
+ *agbno, *aglen);
if (error)
return error;
@@ -868,7 +902,7 @@ xfs_refcount_merge_extents(
aglen);
}
- return error;
+ return 0;
}
/*
@@ -883,25 +917,30 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
- overhead = cur->bc_private.a.priv.refc.shape_changes *
- xfs_allocfree_log_count(cur->bc_mp, 1);
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
* Only allow 2 refcount extent updates per transaction if the
* refcount continue update "error" has been injected.
*/
- if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+ if (cur->bc_ag.refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
- if (cur->bc_private.a.priv.refc.nr_ops == 0)
+ if (cur->bc_ag.refc.nr_ops == 0)
return true;
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
- cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -915,8 +954,7 @@ xfs_refcount_adjust_extents(
struct xfs_btree_cur *cur,
xfs_agblock_t *agbno,
xfs_extlen_t *aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
struct xfs_refcount_irec ext, tmp;
int error;
@@ -927,7 +965,8 @@ xfs_refcount_adjust_extents(
if (*aglen == 0)
return 0;
- error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_SHARED, *agbno,
+ &found_rec);
if (error)
goto out_error;
@@ -935,10 +974,11 @@ xfs_refcount_adjust_extents(
error = xfs_refcount_get_rec(cur, &ext, &found_rec);
if (error)
goto out_error;
- if (!found_rec) {
+ if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) {
ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_SHARED;
}
/*
@@ -951,13 +991,16 @@ xfs_refcount_adjust_extents(
tmp.rc_blockcount = min(*aglen,
ext.rc_startblock - *agbno);
tmp.rc_refcount = 1 + adj;
+ tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
+
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &tmp);
+ cur->bc_ag.pag->pag_agno, &tmp);
/*
* Either cover the hole (increment) or
* delete the range (decrement).
*/
+ cur->bc_ag.refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -968,27 +1011,41 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_private.a.priv.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_private.a.agno,
+ cur->bc_ag.pag->pag_agno,
tmp.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno,
- tmp.rc_blockcount, oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ tmp.rc_blockcount, NULL);
}
(*agbno) += tmp.rc_blockcount;
(*aglen) -= tmp.rc_blockcount;
- error = xfs_refcount_lookup_ge(cur, *agbno,
+ /* Stop if there's nothing left to modify */
+ if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+ break;
+
+ /* Move the cursor to the start of ext. */
+ error = xfs_refcount_lookup_ge(cur,
+ XFS_REFC_DOMAIN_SHARED, *agbno,
&found_rec);
if (error)
goto out_error;
}
- /* Stop if there's nothing left to modify */
- if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
- break;
+ /*
+ * A previous step trimmed agbno/aglen such that the end of the
+ * range would not be in the middle of the record. If this is
+ * no longer the case, something is seriously wrong with the
+ * btree. Make sure we never feed the synthesized record into
+ * the processing loop below.
+ */
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
+ XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/*
* Adjust the reference count and either update the tree
@@ -998,12 +1055,12 @@ xfs_refcount_adjust_extents(
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &ext);
+ cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_private.a.priv.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
@@ -1012,14 +1069,13 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_private.a.priv.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_private.a.agno,
+ cur->bc_ag.pag->pag_agno,
ext.rc_startblock);
- xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
- oinfo);
+ xfs_free_extent_later(cur->bc_tp, fsbno,
+ ext.rc_blockcount, NULL);
}
skip:
@@ -1035,7 +1091,7 @@ advloop:
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1047,8 +1103,7 @@ xfs_refcount_adjust(
xfs_extlen_t aglen,
xfs_agblock_t *new_agbno,
xfs_extlen_t *new_aglen,
- enum xfs_refc_adjust_op adj,
- struct xfs_owner_info *oinfo)
+ enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int shape_changes = 0;
@@ -1057,22 +1112,24 @@ xfs_refcount_adjust(
*new_agbno = agbno;
*new_aglen = aglen;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno, aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno, aglen);
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
- error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
shape_changes++;
- error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
+ agbno + aglen, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1081,25 +1138,24 @@ xfs_refcount_adjust(
/*
* Try to merge with the left or right extents of the range.
*/
- error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
- XFS_FIND_RCEXT_SHARED, &shape_changed);
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
+ new_agbno, new_aglen, adj, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
shape_changes++;
if (shape_changes)
- cur->bc_private.a.priv.refc.shape_changes++;
+ cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
- adj, oinfo);
+ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
if (error)
goto out_error;
return 0;
out_error:
- trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -1115,13 +1171,39 @@ xfs_refcount_finish_one_cleanup(
if (rcur == NULL)
return;
- agbp = rcur->bc_private.a.agbp;
+ agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
if (error)
xfs_trans_brelse(tp, agbp);
}
/*
+ * Set up a continuation a deferred refcount operation by updating the intent.
+ * Checks to make sure we're not going to run off the end of the AG.
+ */
+static inline int
+xfs_refcount_continue_op(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t startblock,
+ xfs_agblock_t new_agbno,
+ xfs_extlen_t new_len,
+ xfs_fsblock_t *new_fsbno)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ return -EFSCORRUPTED;
+
+ *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+
+ ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+
+ return 0;
+}
+
+/*
* Process one of the deferred refcount operations. We pass back the
* btree cursor to maintain our lock on the btree between calls.
* This saves time and eliminates a buffer deadlock between the
@@ -1142,62 +1224,66 @@ xfs_refcount_finish_one(
struct xfs_btree_cur *rcur;
struct xfs_buf *agbp = NULL;
int error = 0;
- xfs_agnumber_t agno;
xfs_agblock_t bno;
xfs_agblock_t new_agbno;
unsigned long nr_ops = 0;
int shape_changes = 0;
+ struct xfs_perag *pag;
- agno = XFS_FSB_TO_AGNO(mp, startblock);
- ASSERT(agno != NULLAGNUMBER);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
bno = XFS_FSB_TO_AGBNO(mp, startblock);
trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
type, XFS_FSB_TO_AGBNO(mp, startblock),
blockcount);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_REFCOUNT_FINISH_ONE))
- return -EIO;
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
/*
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_private.a.agno != agno) {
- nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
- shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ nr_ops = rcur->bc_ag.refc.nr_ops;
+ shape_changes = rcur->bc_ag.refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(tp->t_mountp, tp, agno,
- XFS_ALLOC_FLAG_FREEING, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
+ &agbp);
if (error)
- return error;
+ goto out_drop;
- rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
- if (!rcur) {
- error = -ENOMEM;
- goto out_cur;
- }
- rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
- rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ rcur->bc_ag.refc.nr_ops = nr_ops;
+ rcur->bc_ag.refc.shape_changes = shape_changes;
}
*pcur = rcur;
switch (type) {
case XFS_REFCOUNT_INCREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL);
- *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ new_len, XFS_REFCOUNT_ADJUST_INCREASE);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
break;
case XFS_REFCOUNT_DECREASE:
error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL);
- *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+ new_len, XFS_REFCOUNT_ADJUST_DECREASE);
+ if (error)
+ goto out_drop;
+ if (*new_len > 0)
+ error = xfs_refcount_continue_op(rcur, startblock,
+ new_agbno, *new_len, new_fsb);
break;
case XFS_REFCOUNT_ALLOC_COW:
*new_fsb = startblock + blockcount;
@@ -1214,13 +1300,10 @@ xfs_refcount_finish_one(
error = -EFSCORRUPTED;
}
if (!error && *new_len > 0)
- trace_xfs_refcount_finish_one_leftover(mp, agno, type,
+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type,
bno, blockcount, new_agbno, *new_len);
- return error;
-
-out_cur:
- xfs_trans_brelse(tp, agbp);
-
+out_drop:
+ xfs_perag_put(pag);
return error;
}
@@ -1241,8 +1324,8 @@ __xfs_refcount_add(
type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
blockcount);
- ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
- KM_NOFS);
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
@@ -1259,7 +1342,7 @@ xfs_refcount_increase_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
+ if (!xfs_has_reflink(tp->t_mountp))
return;
__xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
@@ -1274,7 +1357,7 @@ xfs_refcount_decrease_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
- if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
+ if (!xfs_has_reflink(tp->t_mountp))
return;
__xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
@@ -1303,7 +1386,7 @@ xfs_refcount_find_shared(
int have;
int error;
- trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
agbno, aglen);
/* By default, skip the whole range */
@@ -1311,7 +1394,8 @@ xfs_refcount_find_shared(
*flen = 0;
/* Try to find a refcount extent that crosses the start */
- error = xfs_refcount_lookup_le(cur, agbno, &have);
+ error = xfs_refcount_lookup_le(cur, XFS_REFC_DOMAIN_SHARED, agbno,
+ &have);
if (error)
goto out_error;
if (!have) {
@@ -1329,6 +1413,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
/* If the extent ends before the start, look at the next one */
if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
@@ -1344,6 +1430,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED)
+ goto done;
}
/* If the extent starts after the range we want, bail out */
@@ -1375,7 +1463,8 @@ xfs_refcount_find_shared(
error = -EFSCORRUPTED;
goto out_error;
}
- if (tmp.rc_startblock >= agbno + aglen ||
+ if (tmp.rc_domain != XFS_REFC_DOMAIN_SHARED ||
+ tmp.rc_startblock >= agbno + aglen ||
tmp.rc_startblock != *fbno + *flen)
break;
*flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
@@ -1383,12 +1472,12 @@ xfs_refcount_find_shared(
done:
trace_xfs_refcount_find_shared_result(cur->bc_mp,
- cur->bc_private.a.agno, *fbno, *flen);
+ cur->bc_ag.pag->pag_agno, *fbno, *flen);
out_error:
if (error)
trace_xfs_refcount_find_shared_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1459,17 +1548,23 @@ xfs_refcount_adjust_cow_extents(
return 0;
/* Find any overlapping refcount records */
- error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+ error = xfs_refcount_lookup_ge(cur, XFS_REFC_DOMAIN_COW, agbno,
+ &found_rec);
if (error)
goto out_error;
error = xfs_refcount_get_rec(cur, &ext, &found_rec);
if (error)
goto out_error;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
+ ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (!found_rec) {
- ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
- XFS_REFC_COW_START;
+ ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
ext.rc_blockcount = 0;
ext.rc_refcount = 0;
+ ext.rc_domain = XFS_REFC_DOMAIN_COW;
}
switch (adj) {
@@ -1484,8 +1579,10 @@ xfs_refcount_adjust_cow_extents(
tmp.rc_startblock = agbno;
tmp.rc_blockcount = aglen;
tmp.rc_refcount = 1;
+ tmp.rc_domain = XFS_REFC_DOMAIN_COW;
+
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &tmp);
+ cur->bc_ag.pag->pag_agno, &tmp);
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1513,7 +1610,7 @@ xfs_refcount_adjust_cow_extents(
ext.rc_refcount = 0;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &ext);
+ cur->bc_ag.pag->pag_agno, &ext);
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
@@ -1529,7 +1626,7 @@ xfs_refcount_adjust_cow_extents(
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1546,24 +1643,24 @@ xfs_refcount_adjust_cow(
bool shape_changed;
int error;
- agbno += XFS_REFC_COW_START;
-
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
- error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno, &shape_changed);
if (error)
goto out_error;
- error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_COW,
+ agbno + aglen, &shape_changed);
if (error)
goto out_error;
/*
* Try to merge with the left or right extents of the range.
*/
- error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
- XFS_FIND_RCEXT_COW, &shape_changed);
+ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_COW, &agbno,
+ &aglen, adj, &shape_changed);
if (error)
goto out_error;
@@ -1575,7 +1672,7 @@ xfs_refcount_adjust_cow(
return 0;
out_error:
- trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -1589,7 +1686,7 @@ __xfs_refcount_cow_alloc(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
agbno, aglen);
/* Add refcount btree reservation */
@@ -1606,7 +1703,7 @@ __xfs_refcount_cow_free(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
agbno, aglen);
/* Remove refcount btree reservation */
@@ -1623,7 +1720,7 @@ xfs_refcount_alloc_cow_extent(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return;
__xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
@@ -1642,7 +1739,7 @@ xfs_refcount_free_cow_extent(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return;
/* Remove rmap entry */
@@ -1660,7 +1757,7 @@ struct xfs_refcount_recovery {
STATIC int
xfs_refcount_recover_extent(
struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
+ const union xfs_btree_rec *rec,
void *priv)
{
struct list_head *debris = priv;
@@ -1670,10 +1767,18 @@ xfs_refcount_recover_extent(
be32_to_cpu(rec->refc.rc_refcount) != 1))
return -EFSCORRUPTED;
- rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
+ rr = kmalloc(sizeof(struct xfs_refcount_recovery),
+ GFP_KERNEL | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- list_add_tail(&rr->rr_list, debris);
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+ kfree(rr);
+ return -EFSCORRUPTED;
+ }
+
+ list_add_tail(&rr->rr_list, debris);
return 0;
}
@@ -1681,7 +1786,7 @@ xfs_refcount_recover_extent(
int
xfs_refcount_recover_cow_leftovers(
struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xfs_trans *tp;
struct xfs_btree_cur *cur;
@@ -1691,10 +1796,11 @@ xfs_refcount_recover_cow_leftovers(
union xfs_btree_irec low;
union xfs_btree_irec high;
xfs_fsblock_t fsb;
- xfs_agblock_t agbno;
int error;
- if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+ /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */
+ BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG);
+ if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS)
return -EOPNOTSUPP;
INIT_LIST_HEAD(&debris);
@@ -1713,15 +1819,15 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
goto out_trans;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
/* Find all the leftover CoW staging extents. */
memset(&low, 0, sizeof(low));
memset(&high, 0, sizeof(high));
- low.rc.rc_startblock = XFS_REFC_COW_START;
+ low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW;
high.rc.rc_startblock = -1U;
error = xfs_btree_query_range(cur, &low, &high,
xfs_refcount_recover_extent, &debris);
@@ -1738,23 +1844,24 @@ xfs_refcount_recover_cow_leftovers(
if (error)
goto out_free;
- trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec);
+ trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
+ &rr->rr_rrec);
/* Free the orphan record */
- agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
- fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
+ fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
+ rr->rr_rrec.rc_startblock);
xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
/* Free the block. */
- xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+ xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
error = xfs_trans_commit(tp);
if (error)
goto out_free;
list_del(&rr->rr_list);
- kmem_free(rr);
+ kfree(rr);
}
return error;
@@ -1764,7 +1871,7 @@ out_free:
/* Free the leftover list */
list_for_each_entry_safe(rr, n, &debris, rr_list) {
list_del(&rr->rr_list);
- kmem_free(rr);
+ kfree(rr);
}
return error;
}
@@ -1773,6 +1880,7 @@ out_free:
int
xfs_refcount_has_record(
struct xfs_btree_cur *cur,
+ enum xfs_refc_domain domain,
xfs_agblock_t bno,
xfs_extlen_t len,
bool *exists)
@@ -1784,6 +1892,24 @@ xfs_refcount_has_record(
low.rc.rc_startblock = bno;
memset(&high, 0xFF, sizeof(high));
high.rc.rc_startblock = bno + len - 1;
+ low.rc.rc_domain = high.rc.rc_domain = domain;
return xfs_btree_has_record(cur, &low, &high, exists);
}
+
+int __init
+xfs_refcount_intent_init_cache(void)
+{
+ xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent",
+ sizeof(struct xfs_refcount_intent),
+ 0, 0, NULL);
+
+ return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_refcount_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_refcount_intent_cache);
+ xfs_refcount_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 209795539c8d..452f30556f5a 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -6,15 +6,41 @@
#ifndef __XFS_REFCOUNT_H__
#define __XFS_REFCOUNT_H__
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_btree_cur;
+struct xfs_bmbt_irec;
+struct xfs_refcount_irec;
+
extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, int *stat);
+ enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+static inline uint32_t
+xfs_refcount_encode_startblock(
+ xfs_agblock_t startblock,
+ enum xfs_refc_domain domain)
+{
+ uint32_t start;
+
+ /*
+ * low level btree operations need to handle the generic btree range
+ * query functions (which set rc_domain == -1U), so we check that the
+ * domain is /not/ shared.
+ */
+ start = startblock & ~XFS_REFC_COWFLAG;
+ if (domain != XFS_REFC_DOMAIN_SHARED)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
enum xfs_refcount_intent_type {
XFS_REFCOUNT_INCREASE = 1,
XFS_REFCOUNT_DECREASE,
@@ -25,10 +51,22 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
enum xfs_refcount_intent_type ri_type;
- xfs_fsblock_t ri_startblock;
xfs_extlen_t ri_blockcount;
+ xfs_fsblock_t ri_startblock;
};
+/* Check that the refcount is appropriate for the record domain. */
+static inline bool
+xfs_refcount_check_domain(
+ const struct xfs_refcount_irec *irec)
+{
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1)
+ return false;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2)
+ return false;
+ return true;
+}
+
void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
void xfs_refcount_decrease_extent(struct xfs_trans *tp,
@@ -50,7 +88,7 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
xfs_extlen_t len);
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
- xfs_agnumber_t agno);
+ struct xfs_perag *pag);
/*
* While we're adjusting the refcounts records of an extent, we have
@@ -60,20 +98,29 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
-static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
-{
- return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
-}
-
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+ enum xfs_refc_domain domain, xfs_agblock_t bno,
+ xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
-extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec,
+extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
+extern struct kmem_cache *xfs_refcount_intent_cache;
+
+int __init xfs_refcount_intent_init_cache(void);
+void xfs_refcount_intent_destroy_cache(void);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 38529dbacd55..e1f789866683 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -9,42 +9,44 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct kmem_cache *xfs_refcountbt_cur_cache;
static struct xfs_btree_cur *
xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_ag.agbp, cur->bc_ag.pag);
}
STATIC void
xfs_refcountbt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
- struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = agbp->b_pag;
ASSERT(ptr->s != 0);
agf->agf_refcount_root = ptr->s;
be32_add_cpu(&agf->agf_refcount_level, inc);
pag->pagf_refcount_level += inc;
- xfs_perag_put(pag);
xfs_alloc_log_agf(cur->bc_tp, agbp,
XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
@@ -52,13 +54,13 @@ xfs_refcountbt_set_root(
STATIC int
xfs_refcountbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
struct xfs_alloc_arg args; /* block allocation args */
int error; /* error return value */
@@ -66,7 +68,7 @@ xfs_refcountbt_alloc_block(
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
xfs_refc_block(args.mp));
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
@@ -75,13 +77,13 @@ xfs_refcountbt_alloc_block(
error = xfs_alloc_vextent(&args);
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
}
- ASSERT(args.agno == cur->bc_private.a.agno);
+ ASSERT(args.agno == cur->bc_ag.pag->pag_agno);
ASSERT(args.len == 1);
new->s = cpu_to_be32(args.agbno);
@@ -101,12 +103,12 @@ xfs_refcountbt_free_block(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
int error;
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
@@ -136,18 +138,18 @@ xfs_refcountbt_get_maxrecs(
STATIC void
xfs_refcountbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->refc.rc_startblock = rec->refc.rc_startblock;
}
STATIC void
xfs_refcountbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- __u32 x;
+ __u32 x;
x = be32_to_cpu(rec->refc.rc_startblock);
x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
@@ -159,7 +161,12 @@ xfs_refcountbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
{
- rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
+
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ rec->refc.rc_startblock = cpu_to_be32(start);
rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
}
@@ -169,29 +176,32 @@ xfs_refcountbt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_refcount_root;
}
STATIC int64_t
xfs_refcountbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- struct xfs_refcount_irec *rec = &cur->bc_rec.rc;
- struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_key *kp = &key->refc;
+ const struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ uint32_t start;
- return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+ start = xfs_refcount_encode_startblock(irec->rc_startblock,
+ irec->rc_domain);
+ return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
}
STATIC int64_t
xfs_refcountbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
be32_to_cpu(k2->refc.rc_startblock);
@@ -210,7 +220,7 @@ xfs_refcountbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return __this_address;
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
@@ -270,9 +280,9 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
STATIC int
xfs_refcountbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
return be32_to_cpu(k1->refc.rc_startblock) <
be32_to_cpu(k2->refc.rc_startblock);
@@ -280,9 +290,9 @@ xfs_refcountbt_keys_inorder(
STATIC int
xfs_refcountbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
return be32_to_cpu(r1->refc.rc_startblock) +
be32_to_cpu(r1->refc.rc_blockcount) <=
@@ -311,42 +321,102 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
};
/*
- * Allocate a new refcount btree cursor.
+ * Initialize a new refcount btree cursor.
*/
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
+static struct xfs_btree_cur *
+xfs_refcountbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
struct xfs_btree_cur *cur;
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agno < mp->m_sb.sb_agcount);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+ ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = XFS_BTNUM_REFC;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_refcountbt_ops;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+ mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
- cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
-
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.a.priv.refc.nr_ops = 0;
- cur->bc_private.a.priv.refc.shape_changes = 0;
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ cur->bc_ag.refc.nr_ops = 0;
+ cur->bc_ag.refc.shape_changes = 0;
+ cur->bc_ops = &xfs_refcountbt_ops;
+ return cur;
+}
+
+/* Create a btree cursor. */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, tp, pag);
+ cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create a btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_refcountbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+ cur = xfs_refcountbt_init_common(mp, NULL, pag);
+ xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
/*
+ * Swap in the new btree root. Once we pass this point the newly rebuilt btree
+ * is in place and we have to kill off all the old btree blocks.
+ */
+void
+xfs_refcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_refcount_root = cpu_to_be32(afake->af_root);
+ agf->agf_refcount_level = cpu_to_be32(afake->af_levels);
+ agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
+ XFS_AGF_REFCOUNT_ROOT |
+ XFS_AGF_REFCOUNT_LEVEL);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+}
+
+/* Calculate number of records in a refcount btree block. */
+static inline unsigned int
+xfs_refcountbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_refcount_rec);
+ return blocklen / (sizeof(struct xfs_refcount_key) +
+ sizeof(xfs_refcount_ptr_t));
+}
+
+/*
* Calculate the number of records in a refcount btree block.
*/
int
@@ -355,11 +425,22 @@ xfs_refcountbt_maxrecs(
bool leaf)
{
blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+ return xfs_refcountbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_refcount_rec);
- return blocklen / (sizeof(struct xfs_refcount_key) +
- sizeof(xfs_refcount_ptr_t));
+/* Compute the max possible height of the maximally sized refcount btree. */
+unsigned int
+xfs_refcountbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2;
+
+ return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of a refcount btree. */
@@ -367,8 +448,14 @@ void
xfs_refcountbt_compute_maxlevels(
struct xfs_mount *mp)
{
+ if (!xfs_has_reflink(mp)) {
+ mp->m_refc_maxlevels = 0;
+ return;
+ }
+
mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(
mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+ ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -402,7 +489,7 @@ int
xfs_refcountbt_calc_reserves(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
@@ -412,15 +499,14 @@ xfs_refcountbt_calc_reserves(
xfs_extlen_t tree_len;
int error;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
-
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
- agf = XFS_BUF_TO_AGF(agbp);
+ agf = agbp->b_addr;
agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_refcount_blocks);
xfs_trans_brelse(tp, agbp);
@@ -430,8 +516,7 @@ xfs_refcountbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
*ask += xfs_refcountbt_max_size(mp, agblocks);
@@ -439,3 +524,22 @@ xfs_refcountbt_calc_reserves(
return error;
}
+
+int __init
+xfs_refcountbt_init_cur_cache(void)
+{
+ xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur",
+ xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_refcountbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_refcountbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_refcountbt_cur_cache);
+ xfs_refcountbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index ba416f71c824..d66b37259bed 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -13,6 +13,8 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xfs_perag;
+struct xbtree_afakeroot;
/*
* Btree block header size
@@ -45,7 +47,9 @@ struct xfs_mount;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno);
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
@@ -55,7 +59,15 @@ extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp,
xfs_agblock_t agblocks);
extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
- struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask,
+ struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask,
xfs_extlen_t *used);
+void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
+unsigned int xfs_refcountbt_maxlevels_ondisk(void);
+
+int __init xfs_refcountbt_init_cur_cache(void);
+void xfs_refcountbt_destroy_cur_cache(void);
+
#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index ff9412f113c4..b56aca1e7c66 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -11,6 +11,7 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_sb.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
@@ -21,6 +22,9 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_inode.h"
+#include "xfs_ag.h"
+
+struct kmem_cache *xfs_rmap_intent_cache;
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
@@ -30,18 +34,32 @@ int
xfs_rmap_lookup_le(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
- xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags,
+ struct xfs_rmap_irec *irec,
int *stat)
{
+ int get_stat = 0;
+ int error;
+
cur->bc_rec.r.rm_startblock = bno;
- cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_blockcount = 0;
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
}
/*
@@ -79,7 +97,7 @@ xfs_rmap_update(
union xfs_btree_rec rec;
int error;
- trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
irec->rm_startblock, irec->rm_blockcount,
irec->rm_owner, irec->rm_offset, irec->rm_flags);
@@ -91,7 +109,7 @@ xfs_rmap_update(
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_rmap_update_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -107,7 +125,7 @@ xfs_rmap_insert(
int i;
int error;
- trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
@@ -133,7 +151,7 @@ xfs_rmap_insert(
done:
if (error)
trace_xfs_rmap_insert_error(rcur->bc_mp,
- rcur->bc_private.a.agno, error, _RET_IP_);
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -149,7 +167,7 @@ xfs_rmap_delete(
int i;
int error;
- trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
@@ -170,15 +188,15 @@ xfs_rmap_delete(
done:
if (error)
trace_xfs_rmap_delete_error(rcur->bc_mp,
- rcur->bc_private.a.agno, error, _RET_IP_);
+ rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
/* Convert an internal btree record to an rmap record. */
int
xfs_rmap_btrec_to_irec(
- union xfs_btree_rec *rec,
- struct xfs_rmap_irec *irec)
+ const union xfs_btree_rec *rec,
+ struct xfs_rmap_irec *irec)
{
irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
@@ -197,7 +215,7 @@ xfs_rmap_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
@@ -217,13 +235,8 @@ xfs_rmap_get_rec(
goto out_bad_rec;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, irec->rm_startblock))
- goto out_bad_rec;
- if (irec->rm_startblock >
- irec->rm_startblock + irec->rm_blockcount)
- goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno,
- irec->rm_startblock + irec->rm_blockcount - 1))
+ if (!xfs_verify_agbext(pag, irec->rm_startblock,
+ irec->rm_blockcount))
goto out_bad_rec;
}
@@ -236,7 +249,7 @@ xfs_rmap_get_rec(
out_bad_rec:
xfs_warn(mp,
"Reverse Mapping BTree record corruption in AG %d detected!",
- agno);
+ pag->pag_agno);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
@@ -247,20 +260,19 @@ out_bad_rec:
struct xfs_find_left_neighbor_info {
struct xfs_rmap_irec high;
struct xfs_rmap_irec *irec;
- int *stat;
};
/* For each rmap given, figure out if it matches the key we want. */
STATIC int
xfs_rmap_find_left_neighbor_helper(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
- void *priv)
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
{
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
- cur->bc_private.a.agno, rec->rm_startblock,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -272,7 +284,6 @@ xfs_rmap_find_left_neighbor_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -281,7 +292,7 @@ xfs_rmap_find_left_neighbor_helper(
* return a match with the same owner and adjacent physical and logical
* block ranges.
*/
-int
+STATIC int
xfs_rmap_find_left_neighbor(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
@@ -292,6 +303,7 @@ xfs_rmap_find_left_neighbor(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
*stat = 0;
@@ -309,34 +321,57 @@ xfs_rmap_find_left_neighbor(
info.high.rm_flags = flags;
info.high.rm_blockcount = 0;
info.irec = irec;
- info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
- cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_find_left_neighbor_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/* For each rmap given, figure out if it matches the key we want. */
STATIC int
xfs_rmap_lookup_le_range_helper(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
- void *priv)
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
{
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
- cur->bc_private.a.agno, rec->rm_startblock,
+ cur->bc_ag.pag->pag_agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -349,7 +384,6 @@ xfs_rmap_lookup_le_range_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -370,6 +404,7 @@ xfs_rmap_lookup_le_range(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
info.high.rm_startblock = bno;
@@ -382,20 +417,44 @@ xfs_rmap_lookup_le_range(
info.high.rm_blockcount = 0;
*stat = 0;
info.irec = irec;
- info.stat = stat;
-
- trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_private.a.agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_lookup_le_range_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/*
@@ -498,7 +557,7 @@ xfs_rmap_unmap(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -506,7 +565,7 @@ xfs_rmap_unmap(
* for the AG headers at rm_startblock == 0 created by mkfs/growfs that
* will not ever be removed from the tree.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -514,15 +573,8 @@ xfs_rmap_unmap(
goto out_error;
}
- error = xfs_rmap_get_rec(cur, &ltrec, &i);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, ltrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
ltoff = ltrec.rm_offset;
@@ -588,7 +640,7 @@ xfs_rmap_unmap(
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* exact match, simply remove the record from rmap tree */
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
ltrec.rm_startblock, ltrec.rm_blockcount,
ltrec.rm_owner, ltrec.rm_offset,
ltrec.rm_flags);
@@ -666,7 +718,7 @@ xfs_rmap_unmap(
else
cur->bc_rec.r.rm_offset = offset + len;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
cur->bc_rec.r.rm_startblock,
cur->bc_rec.r.rm_blockcount,
cur->bc_rec.r.rm_owner,
@@ -678,11 +730,11 @@ xfs_rmap_unmap(
}
out_done:
- trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -694,7 +746,7 @@ int
xfs_rmap_free(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo)
@@ -703,10 +755,10 @@ xfs_rmap_free(
struct xfs_btree_cur *cur;
int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
- cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
@@ -773,7 +825,7 @@ xfs_rmap_map(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
@@ -782,20 +834,13 @@ xfs_rmap_map(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
&have_lt);
if (error)
goto out_error;
if (have_lt) {
- error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, ltrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
@@ -831,7 +876,7 @@ xfs_rmap_map(
goto out_error;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, gtrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
@@ -870,7 +915,7 @@ xfs_rmap_map(
* result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
*/
ltrec.rm_blockcount += gtrec.rm_blockcount;
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
gtrec.rm_startblock,
gtrec.rm_blockcount,
gtrec.rm_owner,
@@ -921,7 +966,7 @@ xfs_rmap_map(
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
owner, offset, flags);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -932,11 +977,11 @@ xfs_rmap_map(
}
}
- trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_);
return error;
}
@@ -948,7 +993,7 @@ int
xfs_rmap_alloc(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo)
@@ -957,10 +1002,10 @@ xfs_rmap_alloc(
struct xfs_btree_cur *cur;
int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
- cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1010,7 +1055,7 @@ xfs_rmap_convert(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -1018,7 +1063,7 @@ xfs_rmap_convert(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1026,15 +1071,8 @@ xfs_rmap_convert(
goto done;
}
- error = xfs_rmap_get_rec(cur, &PREV, &i);
- if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, PREV.rm_startblock,
+ cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
PREV.rm_offset, PREV.rm_flags);
@@ -1076,7 +1114,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, LEFT.rm_startblock,
+ cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
LEFT.rm_blockcount, LEFT.rm_owner,
LEFT.rm_offset, LEFT.rm_flags);
if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
@@ -1114,7 +1152,7 @@ xfs_rmap_convert(
goto done;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, RIGHT.rm_startblock,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (bno + len == RIGHT.rm_startblock &&
@@ -1132,11 +1170,11 @@ xfs_rmap_convert(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
_RET_IP_);
/* reset the cursor back to PREV */
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1162,7 +1200,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
@@ -1180,7 +1218,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
@@ -1210,7 +1248,7 @@ xfs_rmap_convert(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
@@ -1247,7 +1285,7 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
@@ -1326,7 +1364,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1383,7 +1421,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1414,7 +1452,7 @@ xfs_rmap_convert(
NEW = PREV;
NEW.rm_blockcount = offset - PREV.rm_offset;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
NEW.rm_startblock, NEW.rm_blockcount,
NEW.rm_owner, NEW.rm_offset,
NEW.rm_flags);
@@ -1441,7 +1479,7 @@ xfs_rmap_convert(
/* new middle extent - newext */
cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
cur->bc_rec.r.rm_flags |= newext;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1465,12 +1503,12 @@ xfs_rmap_convert(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1506,7 +1544,7 @@ xfs_rmap_convert_shared(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -1514,7 +1552,7 @@ xfs_rmap_convert_shared(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, oldext,
&PREV, &i);
if (error)
goto done;
@@ -1573,7 +1611,7 @@ xfs_rmap_convert_shared(
goto done;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, RIGHT.rm_startblock,
+ cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1589,7 +1627,7 @@ xfs_rmap_convert_shared(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
_RET_IP_);
/*
* Switch out based on the FILLING and CONTIG state bits.
@@ -1880,12 +1918,12 @@ xfs_rmap_convert_shared(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -1923,7 +1961,7 @@ xfs_rmap_unmap_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/*
@@ -2072,12 +2110,12 @@ xfs_rmap_unmap_shared(
goto out_error;
}
- trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_unmap_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -2112,7 +2150,7 @@ xfs_rmap_map_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
/* Is there a left record that abuts our range? */
@@ -2138,7 +2176,7 @@ xfs_rmap_map_shared(
goto out_error;
}
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, gtrec.rm_startblock,
+ cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
@@ -2231,12 +2269,12 @@ xfs_rmap_map_shared(
goto out_error;
}
- trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_map_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.pag->pag_agno, error, _RET_IP_);
return error;
}
@@ -2276,9 +2314,9 @@ struct xfs_rmap_query_range_info {
/* Format btree record and pass to our callback. */
STATIC int
xfs_rmap_query_range_helper(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *rec,
- void *priv)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
{
struct xfs_rmap_query_range_info *query = priv;
struct xfs_rmap_irec irec;
@@ -2294,8 +2332,8 @@ xfs_rmap_query_range_helper(
int
xfs_rmap_query_range(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec,
- struct xfs_rmap_irec *high_rec,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn,
void *priv)
{
@@ -2336,7 +2374,7 @@ xfs_rmap_finish_one_cleanup(
if (rcur == NULL)
return;
- agbp = rcur->bc_private.a.agbp;
+ agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
if (error)
xfs_trans_brelse(tp, agbp);
@@ -2362,31 +2400,32 @@ xfs_rmap_finish_one(
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
struct xfs_btree_cur *rcur;
struct xfs_buf *agbp = NULL;
int error = 0;
- xfs_agnumber_t agno;
struct xfs_owner_info oinfo;
xfs_agblock_t bno;
bool unwritten;
- agno = XFS_FSB_TO_AGNO(mp, startblock);
- ASSERT(agno != NULLAGNUMBER);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
bno = XFS_FSB_TO_AGBNO(mp, startblock);
- trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork,
+ trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork,
startoff, blockcount, state);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_RMAP_FINISH_ONE))
- return -EIO;
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
+ error = -EIO;
+ goto out_drop;
+ }
+
/*
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+ if (rcur != NULL && rcur->bc_ag.pag != pag) {
xfs_rmap_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -2397,17 +2436,15 @@ xfs_rmap_finish_one(
* rmapbt, because a shape change could cause us to
* allocate blocks.
*/
- error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error)
- return error;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
- return -EFSCORRUPTED;
-
- rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
- if (!rcur) {
- error = -ENOMEM;
- goto out_cur;
+ goto out_drop;
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+ error = -EFSCORRUPTED;
+ goto out_drop;
}
+
+ rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
}
*pcur = rcur;
@@ -2445,11 +2482,8 @@ xfs_rmap_finish_one(
ASSERT(0);
error = -EFSCORRUPTED;
}
- return error;
-
-out_cur:
- xfs_trans_brelse(tp, agbp);
-
+out_drop:
+ xfs_perag_put(pag);
return error;
}
@@ -2461,7 +2495,7 @@ xfs_rmap_update_is_needed(
struct xfs_mount *mp,
int whichfork)
{
- return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;
+ return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK;
}
/*
@@ -2487,7 +2521,7 @@ __xfs_rmap_add(
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2505,12 +2539,15 @@ xfs_rmap_map_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_MAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -2521,12 +2558,15 @@ xfs_rmap_unmap_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_UNMAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/*
@@ -2543,12 +2583,15 @@ xfs_rmap_convert_extent(
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+
if (!xfs_rmap_update_is_needed(mp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_CONVERT_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
@@ -2668,7 +2711,7 @@ xfs_rmap_record_exists(
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
(flags & XFS_RMAP_BMBT_BLOCK));
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
&has_record);
if (error)
return error;
@@ -2677,14 +2720,6 @@ xfs_rmap_record_exists(
return 0;
}
- error = xfs_rmap_get_rec(cur, &irec, &has_record);
- if (error)
- return error;
- if (!has_record) {
- *has_rmap = false;
- return 0;
- }
-
*has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
@@ -2694,14 +2729,13 @@ struct xfs_rmap_key_state {
uint64_t owner;
uint64_t offset;
unsigned int flags;
- bool has_rmap;
};
/* For each rmap given, figure out if it doesn't match the key we want. */
STATIC int
xfs_rmap_has_other_keys_helper(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xfs_rmap_key_state *rks = priv;
@@ -2709,7 +2743,6 @@ xfs_rmap_has_other_keys_helper(
if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
return 0;
- rks->has_rmap = true;
return -ECANCELED;
}
@@ -2731,7 +2764,7 @@ xfs_rmap_has_other_keys(
int error;
xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
- rks.has_rmap = false;
+ *has_rmap = false;
low.rm_startblock = bno;
memset(&high, 0xFF, sizeof(high));
@@ -2739,11 +2772,12 @@ xfs_rmap_has_other_keys(
error = xfs_rmap_query_range(cur, &low, &high,
xfs_rmap_has_other_keys_helper, &rks);
- if (error < 0)
- return error;
+ if (error == -ECANCELED) {
+ *has_rmap = true;
+ return 0;
+ }
- *has_rmap = rks.has_rmap;
- return 0;
+ return error;
}
const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
@@ -2773,3 +2807,20 @@ const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
.oi_owner = XFS_RMAP_OWN_COW,
};
+
+int __init
+xfs_rmap_intent_init_cache(void)
+{
+ xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent",
+ sizeof(struct xfs_rmap_intent),
+ 0, 0, NULL);
+
+ return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_rmap_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_rmap_intent_cache);
+ xfs_rmap_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index abe633403fd1..54741a591a17 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -6,6 +6,8 @@
#ifndef __XFS_RMAP_H__
#define __XFS_RMAP_H__
+struct xfs_perag;
+
static inline void
xfs_rmap_ino_bmbt_owner(
struct xfs_owner_info *oi,
@@ -113,15 +115,15 @@ xfs_owner_info_pack(
}
int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
const struct xfs_owner_info *oinfo);
int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
+ struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len,
const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, uint64_t owner, uint64_t offset,
- unsigned int flags, int *stat);
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
unsigned int flags, int *stat);
@@ -132,12 +134,13 @@ int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec,
int *stat);
typedef int (*xfs_rmap_query_range_fn)(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
- void *priv);
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv);
int xfs_rmap_query_range(struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
+ const struct xfs_rmap_irec *low_rec,
+ const struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn, void *priv);
int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
void *priv);
@@ -156,8 +159,8 @@ enum xfs_rmap_intent_type {
struct xfs_rmap_intent {
struct list_head ri_list;
enum xfs_rmap_intent_type ri_type;
- uint64_t ri_owner;
int ri_whichfork;
+ uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
};
@@ -181,16 +184,13 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
-int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- uint64_t owner, uint64_t offset, unsigned int flags,
- struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_compare(const struct xfs_rmap_irec *a,
const struct xfs_rmap_irec *b);
union xfs_btree_rec;
-int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec,
+int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, bool *exists);
@@ -212,4 +212,9 @@ extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+extern struct kmem_cache *xfs_rmap_intent_cache;
+
+int __init xfs_rmap_intent_init_cache(void);
+void xfs_rmap_intent_destroy_cache(void);
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index fc78efa52c94..7f83f62e51e0 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -9,18 +9,21 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+static struct kmem_cache *xfs_rmapbt_cur_cache;
+
/*
* Reverse map btree.
*
@@ -51,65 +54,60 @@ xfs_rmapbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_ag.agbp, cur->bc_ag.pag);
}
STATIC void
xfs_rmapbt_set_root(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr,
- int inc)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr,
+ int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
int btnum = cur->bc_btnum;
- struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
ASSERT(ptr->s != 0);
agf->agf_roots[btnum] = ptr->s;
be32_add_cpu(&agf->agf_levels[btnum], inc);
- pag->pagf_levels[btnum] += inc;
- xfs_perag_put(pag);
+ cur->bc_ag.pag->pagf_levels[btnum] += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
STATIC int
xfs_rmapbt_alloc_block(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *start,
- union xfs_btree_ptr *new,
- int *stat)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int *stat)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
int error;
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
- bno, 1);
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
- false);
+ xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
- xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
be32_add_cpu(&agf->agf_rmap_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
*stat = 1;
return 0;
@@ -120,26 +118,25 @@ xfs_rmapbt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = cur->bc_ag.pag;
xfs_agblock_t bno;
int error;
- bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
+ trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
- xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
- xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
- xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
return 0;
}
@@ -161,8 +158,8 @@ xfs_rmapbt_get_maxrecs(
STATIC void
xfs_rmapbt_init_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
key->rmap.rm_startblock = rec->rmap.rm_startblock;
key->rmap.rm_owner = rec->rmap.rm_owner;
@@ -178,11 +175,11 @@ xfs_rmapbt_init_key_from_rec(
*/
STATIC void
xfs_rmapbt_init_high_key_from_rec(
- union xfs_btree_key *key,
- union xfs_btree_rec *rec)
+ union xfs_btree_key *key,
+ const union xfs_btree_rec *rec)
{
- uint64_t off;
- int adj;
+ uint64_t off;
+ int adj;
adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
@@ -215,22 +212,22 @@ xfs_rmapbt_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
STATIC int64_t
xfs_rmapbt_key_diff(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *key)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key)
{
- struct xfs_rmap_irec *rec = &cur->bc_rec.r;
- struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
+ struct xfs_rmap_irec *rec = &cur->bc_rec.r;
+ const struct xfs_rmap_key *kp = &key->rmap;
+ __u64 x, y;
+ int64_t d;
d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
if (d)
@@ -254,14 +251,14 @@ xfs_rmapbt_key_diff(
STATIC int64_t
xfs_rmapbt_diff_two_keys(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
- struct xfs_rmap_key *kp1 = &k1->rmap;
- struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ const struct xfs_rmap_key *kp1 = &k1->rmap;
+ const struct xfs_rmap_key *kp2 = &k2->rmap;
+ int64_t d;
+ __u64 x, y;
d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
be32_to_cpu(kp2->rm_startblock);
@@ -309,7 +306,7 @@ xfs_rmapbt_verify(
if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return __this_address;
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
@@ -369,9 +366,9 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
STATIC int
xfs_rmapbt_keys_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_key *k1,
- union xfs_btree_key *k2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *k1,
+ const union xfs_btree_key *k2)
{
uint32_t x;
uint32_t y;
@@ -399,9 +396,9 @@ xfs_rmapbt_keys_inorder(
STATIC int
xfs_rmapbt_recs_inorder(
- struct xfs_btree_cur *cur,
- union xfs_btree_rec *r1,
- union xfs_btree_rec *r2)
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *r1,
+ const union xfs_btree_rec *r2)
{
uint32_t x;
uint32_t y;
@@ -448,37 +445,95 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.recs_inorder = xfs_rmapbt_recs_inorder,
};
-/*
- * Allocate a new allocation btree cursor.
- */
-struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
+static struct xfs_btree_cur *
+xfs_rmapbt_init_common(
struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
struct xfs_btree_cur *cur;
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
/* Overlapping btree; 2 keys per pointer. */
- cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+ mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_ops = &xfs_rmapbt_ops;
+
+ /* take a reference for the cursor */
+ atomic_inc(&pag->pag_ref);
+ cur->bc_ag.pag = pag;
+
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor. */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_perag *pag)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, tp, pag);
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
+/* Create a new reverse mapping btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_rmapbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+ cur = xfs_rmapbt_init_common(mp, NULL, pag);
+ xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
/*
+ * Install a new reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ */
+void
+xfs_rmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
+ XFS_AGF_RMAP_BLOCKS);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+}
+
+/* Calculate number of records in a reverse mapping btree block. */
+static inline unsigned int
+xfs_rmapbt_block_maxrecs(
+ unsigned int blocklen,
+ bool leaf)
+{
+ if (leaf)
+ return blocklen / sizeof(struct xfs_rmap_rec);
+ return blocklen /
+ (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+}
+
+/*
* Calculate number of records in an rmap btree block.
*/
int
@@ -487,11 +542,33 @@ xfs_rmapbt_maxrecs(
int leaf)
{
blocklen -= XFS_RMAP_BLOCK_LEN;
+ return xfs_rmapbt_block_maxrecs(blocklen, leaf);
+}
- if (leaf)
- return blocklen / sizeof(struct xfs_rmap_rec);
- return blocklen /
- (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
+/* Compute the max possible height for reverse mapping btrees. */
+unsigned int
+xfs_rmapbt_maxlevels_ondisk(void)
+{
+ unsigned int minrecs[2];
+ unsigned int blocklen;
+
+ blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
+
+ minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
+ minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
+
+ /*
+ * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32 (per the
+ * refcount record format) owners, which means that theoretically we
+ * could face up to 2^64 rmap records. However, we're likely to run
+ * out of blocks in the AG long before that happens, which means that
+ * we must compute the max height based on what the btree will look
+ * like if it consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
}
/* Compute the maximum height of an rmap btree. */
@@ -499,26 +576,36 @@ void
xfs_rmapbt_compute_maxlevels(
struct xfs_mount *mp)
{
- /*
- * On a non-reflink filesystem, the maximum number of rmap
- * records is the number of blocks in the AG, hence the max
- * rmapbt height is log_$maxrecs($agblocks). However, with
- * reflink each AG block can have up to 2^32 (per the refcount
- * record format) owners, which means that theoretically we
- * could face up to 2^64 rmap records.
- *
- * That effectively means that the max rmapbt height must be
- * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
- * blocks to feed the rmapbt long before the rmapbt reaches
- * maximum height. The reflink code uses ag_resv_critical to
- * disallow reflinking when less than 10% of the per-AG metadata
- * block reservation since the fallback is a regular file copy.
- */
- if (xfs_sb_version_hasreflink(&mp->m_sb))
- mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
- else
+ if (!xfs_has_rmapbt(mp)) {
+ mp->m_rmap_maxlevels = 0;
+ return;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * Compute the asymptotic maxlevels for an rmap btree on a
+ * filesystem that supports reflink.
+ *
+ * On a reflink filesystem, each AG block can have up to 2^32
+ * (per the refcount record format) owners, which means that
+ * theoretically we could face up to 2^64 rmap records.
+ * However, we're likely to run out of blocks in the AG long
+ * before that happens, which means that we must compute the
+ * max height based on what the btree will look like if it
+ * consumes almost all the blocks in the AG due to maximal
+ * sharing factor.
+ */
+ mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+ mp->m_sb.sb_agblocks);
+ } else {
+ /*
+ * If there's no block sharing, compute the maximum rmapbt
+ * height assuming one rmap record per AG block.
+ */
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+ }
+ ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
}
/* Calculate the refcount btree size for some records. */
@@ -552,7 +639,7 @@ int
xfs_rmapbt_calc_reserves(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
@@ -562,14 +649,14 @@ xfs_rmapbt_calc_reserves(
xfs_extlen_t tree_len;
int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
- agf = XFS_BUF_TO_AGF(agbp);
+ agf = agbp->b_addr;
agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
xfs_trans_brelse(tp, agbp);
@@ -579,8 +666,7 @@ xfs_rmapbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
/* Reserve 1% of the AG or enough for 1 block per record. */
@@ -589,3 +675,22 @@ xfs_rmapbt_calc_reserves(
return error;
}
+
+int __init
+xfs_rmapbt_init_cur_cache(void)
+{
+ xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
+ xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
+ 0, 0, NULL);
+
+ if (!xfs_rmapbt_cur_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_rmapbt_destroy_cur_cache(void)
+{
+ kmem_cache_destroy(xfs_rmapbt_cur_cache);
+ xfs_rmapbt_cur_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 820d668b063d..3244715dd111 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -9,6 +9,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/* rmaps only exist on crc enabled filesystems */
#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
@@ -42,7 +43,11 @@ struct xfs_mount;
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
- xfs_agnumber_t agno);
+ struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, struct xfs_perag *pag);
+void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
@@ -52,6 +57,11 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
xfs_agblock_t agblocks);
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+ struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
-#endif /* __XFS_RMAP_BTREE_H__ */
+unsigned int xfs_rmapbt_maxlevels_ondisk(void);
+
+int __init xfs_rmapbt_init_cur_cache(void);
+void xfs_rmapbt_destroy_cur_cache(void);
+
+#endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f42c74cb8be5..fa180ab66b73 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -56,9 +56,9 @@ xfs_rtbuf_get(
xfs_trans_t *tp, /* transaction pointer */
xfs_rtblock_t block, /* block number in bitmap or summary */
int issum, /* is summary not bitmap */
- xfs_buf_t **bpp) /* output: buffer for the block */
+ struct xfs_buf **bpp) /* output: buffer for the block */
{
- xfs_buf_t *bp; /* block buffer, result */
+ struct xfs_buf *bp; /* block buffer, result */
xfs_inode_t *ip; /* bitmap or summary inode */
xfs_bmbt_irec_t map;
int nmap = 1;
@@ -66,11 +66,11 @@ xfs_rtbuf_get(
ip = issum ? mp->m_rsumip : mp->m_rbmip;
- error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
if (error)
return error;
- if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map)))
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
return -EFSCORRUPTED;
ASSERT(map.br_startblock != NULLFSBLOCK);
@@ -101,7 +101,7 @@ xfs_rtfind_back(
xfs_rtword_t *b; /* current word in buffer */
int bit; /* bit number in the word */
xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
+ struct xfs_buf *bp; /* buf for the block */
xfs_rtword_t *bufp; /* starting word in buffer */
int error; /* error value */
xfs_rtblock_t firstbit; /* first useful bit in the word */
@@ -276,7 +276,7 @@ xfs_rtfind_forw(
xfs_rtword_t *b; /* current word in buffer */
int bit; /* bit number in the word */
xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
+ struct xfs_buf *bp; /* buf for the block */
xfs_rtword_t *bufp; /* starting word in buffer */
int error; /* error value */
xfs_rtblock_t i; /* current bit number rel. to start */
@@ -447,11 +447,11 @@ xfs_rtmodify_summary_int(
int log, /* log2 of extent size */
xfs_rtblock_t bbno, /* bitmap block number */
int delta, /* change to make to summary info */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_suminfo_t *sum) /* out: summary info for this block */
{
- xfs_buf_t *bp; /* buffer for the summary block */
+ struct xfs_buf *bp; /* buffer for the summary block */
int error; /* error value */
xfs_fsblock_t sb; /* summary fsblock */
int so; /* index into the summary file */
@@ -517,7 +517,7 @@ xfs_rtmodify_summary(
int log, /* log2 of extent size */
xfs_rtblock_t bbno, /* bitmap block number */
int delta, /* change to make to summary info */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb) /* in/out: summary block number */
{
return xfs_rtmodify_summary_int(mp, tp, log, bbno,
@@ -539,7 +539,7 @@ xfs_rtmodify_range(
xfs_rtword_t *b; /* current word in buffer */
int bit; /* bit number in the word */
xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
+ struct xfs_buf *bp; /* buf for the block */
xfs_rtword_t *bufp; /* starting word in buffer */
int error; /* error value */
xfs_rtword_t *first; /* first used word in the buffer */
@@ -690,7 +690,7 @@ xfs_rtfree_range(
xfs_trans_t *tp, /* transaction pointer */
xfs_rtblock_t start, /* starting block to free */
xfs_extlen_t len, /* length to free */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb) /* in/out: summary block number */
{
xfs_rtblock_t end; /* end of the freed extent */
@@ -773,7 +773,7 @@ xfs_rtcheck_range(
xfs_rtword_t *b; /* current word in buffer */
int bit; /* bit number in the word */
xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
+ struct xfs_buf *bp; /* buf for the block */
xfs_rtword_t *bufp; /* starting word in buffer */
int error; /* error value */
xfs_rtblock_t i; /* current bit number rel. to start */
@@ -969,7 +969,7 @@ xfs_rtfree_extent(
int error; /* error value */
xfs_mount_t *mp; /* file system mount structure */
xfs_fsblock_t sb; /* summary file block number */
- xfs_buf_t *sumbp = NULL; /* summary file block buffer */
+ struct xfs_buf *sumbp = NULL; /* summary file block buffer */
mp = tp->t_mountp;
@@ -997,8 +997,8 @@ xfs_rtfree_extent(
*/
if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
mp->m_sb.sb_rextents) {
- if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
- mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+ if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
+ mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
*(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
@@ -1008,17 +1008,17 @@ xfs_rtfree_extent(
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_rtalloc_rec *low_rec,
- struct xfs_rtalloc_rec *high_rec,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
struct xfs_rtalloc_rec rec;
- struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
- xfs_rtblock_t rem;
+ xfs_rtblock_t high_key;
int is_free;
int error = 0;
@@ -1027,13 +1027,12 @@ xfs_rtalloc_query_range(
if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
low_rec->ar_startext == high_rec->ar_startext)
return 0;
- if (high_rec->ar_startext > mp->m_sb.sb_rextents)
- high_rec->ar_startext = mp->m_sb.sb_rextents;
+
+ high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */
rtstart = low_rec->ar_startext;
- rem = high_rec->ar_startext - rtstart;
- while (rem) {
+ while (rtstart <= high_key) {
/* Is the first block free? */
error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
&is_free);
@@ -1041,8 +1040,7 @@ xfs_rtalloc_query_range(
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(mp, tp, rtstart,
- high_rec->ar_startext - 1, &rtend);
+ error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
if (error)
break;
@@ -1050,12 +1048,11 @@ xfs_rtalloc_query_range(
rec.ar_startext = rtstart;
rec.ar_extcount = rtend - rtstart + 1;
- error = fn(tp, &rec, priv);
+ error = fn(mp, tp, &rec, priv);
if (error)
break;
}
- rem -= rtend - rtstart + 1;
rtstart = rtend + 1;
}
@@ -1065,6 +1062,7 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
@@ -1072,10 +1070,10 @@ xfs_rtalloc_query_all(
struct xfs_rtalloc_rec keys[2];
keys[0].ar_startext = 0;
- keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
keys[0].ar_extcount = keys[1].ar_extcount = 0;
- return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
}
/* Is the given extent all free? */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 2f60fc3c99a0..a20cade590e9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -15,7 +15,6 @@
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_btree.h"
@@ -25,70 +24,157 @@
#include "xfs_refcount_btree.h"
#include "xfs_da_format.h"
#include "xfs_health.h"
+#include "xfs_ag.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
*/
/*
- * Reference counting access wrappers to the perag structures.
- * Because we never free per-ag structures, the only thing we
- * have to protect against changes is the tree structure itself.
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
*/
-struct xfs_perag *
-xfs_perag_get(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
{
- struct xfs_perag *pag;
- int ref = 0;
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
- rcu_read_lock();
- pag = radix_tree_lookup(&mp->m_perag_tree, agno);
- if (pag) {
- ASSERT(atomic_read(&pag->pag_ref) >= 0);
- ref = atomic_inc_return(&pag->pag_ref);
- }
- rcu_read_unlock();
- trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
- return pag;
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
}
/*
- * search from @first to find the next perag with the given tag set.
+ * We support all XFS versions newer than a v4 superblock with V2 directories.
*/
-struct xfs_perag *
-xfs_perag_get_tag(
- struct xfs_mount *mp,
- xfs_agnumber_t first,
- int tag)
+bool
+xfs_sb_good_version(
+ struct xfs_sb *sbp)
{
- struct xfs_perag *pag;
- int found;
- int ref;
-
- rcu_read_lock();
- found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
- (void **)&pag, first, 1, tag);
- if (found <= 0) {
- rcu_read_unlock();
- return NULL;
- }
- ref = atomic_inc_return(&pag->pag_ref);
- rcu_read_unlock();
- trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
- return pag;
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly as the code checks
+ * those flags and not for v5 support.
+ */
+ if (xfs_sb_is_v5(sbp))
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
+
+ /* versions prior to v4 are not supported */
+ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
+ return false;
+
+ /* V4 filesystems need v2 directories and unwritten extents */
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
+
+ /* It's a supported v4 filesystem */
+ return true;
}
-void
-xfs_perag_put(
- struct xfs_perag *pag)
+uint64_t
+xfs_sb_version_to_features(
+ struct xfs_sb *sbp)
{
- int ref;
+ uint64_t features = 0;
+
+ /* optional V4 features */
+ if (sbp->sb_rblocks > 0)
+ features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
+ features |= XFS_FEAT_ATTR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
+ features |= XFS_FEAT_QUOTA;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)
+ features |= XFS_FEAT_ALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)
+ features |= XFS_FEAT_LOGV2;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT)
+ features |= XFS_FEAT_DALIGN;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)
+ features |= XFS_FEAT_EXTFLG;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT)
+ features |= XFS_FEAT_SECTOR;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT)
+ features |= XFS_FEAT_ASCIICI;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
+ if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
+ features |= XFS_FEAT_LAZYSBCOUNT;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
+ features |= XFS_FEAT_ATTR2;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
+ features |= XFS_FEAT_PROJID32;
+ if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ }
- ASSERT(atomic_read(&pag->pag_ref) > 0);
- ref = atomic_dec_return(&pag->pag_ref);
- trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+ if (!xfs_sb_is_v5(sbp))
+ return features;
+
+ /* Always on V5 features */
+ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
+
+ /* Optional V5 features */
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT)
+ features |= XFS_FEAT_FINOBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+ features |= XFS_FEAT_RMAPBT;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ features |= XFS_FEAT_REFLINK;
+ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
+ features |= XFS_FEAT_INOBTCNT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
+ features |= XFS_FEAT_FTYPE;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
+ features |= XFS_FEAT_SPINODES;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ features |= XFS_FEAT_META_UUID;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME)
+ features |= XFS_FEAT_BIGTIME;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
+ return features;
}
/* Check all the superblock fields we care about when reading one in. */
@@ -97,7 +183,7 @@ xfs_validate_sb_read(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+ if (!xfs_sb_is_v5(sbp))
return 0;
/*
@@ -117,7 +203,7 @@ xfs_validate_sb_read(
"Superblock has unknown read-only compatible features (0x%x) enabled.",
(sbp->sb_features_ro_compat &
XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ if (!xfs_is_readonly(mp)) {
xfs_warn(mp,
"Attempted to mount read-only compatible filesystem read-write.");
xfs_warn(mp,
@@ -156,7 +242,7 @@ xfs_validate_sb_write(
* secondary superblocks, so allow this usage to continue because
* we never read counters from such superblocks.
*/
- if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
(sbp->sb_fdblocks > sbp->sb_dblocks ||
!xfs_verify_icount(mp, sbp->sb_icount) ||
sbp->sb_ifree > sbp->sb_icount)) {
@@ -164,7 +250,7 @@ xfs_validate_sb_write(
return -EFSCORRUPTED;
}
- if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5)
+ if (!xfs_sb_is_v5(sbp))
return 0;
/*
@@ -220,49 +306,64 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
uint32_t agcount = 0;
uint32_t rem;
+ bool has_dalign;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
return -EWRONGFS;
}
if (!xfs_sb_good_version(sbp)) {
- xfs_warn(mp, "bad version");
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
return -EWRONGFS;
}
- if (xfs_sb_version_has_pquotino(sbp)) {
- if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+ /*
+ * Validate feature flags and state
+ */
+ if (xfs_sb_is_v5(sbp)) {
+ if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
xfs_notice(mp,
- "Version 5 of Super block has XFS_OQUOTA bits.");
+"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)",
+ sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE);
return -EFSCORRUPTED;
}
- } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
- XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+
+ /* V5 has a separate project quota inode */
+ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
xfs_notice(mp,
-"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
+ "Version 5 of Super block has XFS_OQUOTA bits.");
return -EFSCORRUPTED;
- }
+ }
- /*
- * Full inode chunks must be aligned to inode chunk size when
- * sparse inodes are enabled to support the sparse chunk
- * allocation algorithm and prevent overlapping inode records.
- */
- if (xfs_sb_version_hassparseinodes(sbp)) {
- uint32_t align;
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) {
+ uint32_t align;
- align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
- >> sbp->sb_blocklog;
- if (sbp->sb_inoalignmt != align) {
- xfs_warn(mp,
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
- sbp->sb_inoalignmt, align);
- return -EINVAL;
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
}
+ } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+ XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+ xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits.");
+ return -EFSCORRUPTED;
}
if (unlikely(
@@ -328,39 +429,52 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
- if (sbp->sb_unit) {
- if (!xfs_sb_version_hasdalign(sbp) ||
- sbp->sb_unit > sbp->sb_width ||
- (sbp->sb_width % sbp->sb_unit) != 0) {
- xfs_notice(mp, "SB stripe unit sanity check failed");
- return -EFSCORRUPTED;
- }
- } else if (xfs_sb_version_hasdalign(sbp)) {
- xfs_notice(mp, "SB stripe alignment sanity check failed");
- return -EFSCORRUPTED;
- } else if (sbp->sb_width) {
- xfs_notice(mp, "SB stripe width sanity check failed");
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
return -EFSCORRUPTED;
}
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
- xfs_notice(mp, "v5 SB sanity check failed");
- return -EFSCORRUPTED;
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
}
/*
- * Until this is fixed only page-sized or smaller data blocks work.
+ * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign)
+ * would imply the image is corrupted.
*/
- if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
- xfs_warn(mp,
- "File system with blocksize %d bytes. "
- "Only pagesize (%ld) or less will currently work.",
- sbp->sb_blocksize, PAGE_SIZE);
- return -ENOSYS;
+ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT;
+ if (!!sbp->sb_unit ^ has_dalign) {
+ xfs_notice(mp, "SB stripe alignment sanity check failed");
+ return -EFSCORRUPTED;
}
+ if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
+ XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+ return -EFSCORRUPTED;
+
/*
* Currently only very few inode sizes are supported.
*/
@@ -376,22 +490,6 @@ xfs_validate_sb_common(
return -ENOSYS;
}
- if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
- xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
- xfs_warn(mp,
- "file system too large to be mounted on this system.");
- return -EFBIG;
- }
-
- /*
- * Don't touch the filesystem if a user tool thinks it owns the primary
- * superblock. mkfs doesn't clear the flag from secondary supers, so
- * we don't check them at all.
- */
- if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && sbp->sb_inprogress) {
- xfs_warn(mp, "Offline file system operation in progress!");
- return -EFSCORRUPTED;
- }
return 0;
}
@@ -420,7 +518,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
* We need to do these manipilations only if we are working
* with an older version of on-disk superblock.
*/
- if (xfs_sb_version_has_pquotino(sbp))
+ if (xfs_sb_is_v5(sbp))
return;
if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
@@ -450,7 +548,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
static void
__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from,
+ struct xfs_dsb *from,
bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
@@ -513,7 +611,8 @@ __xfs_sb_from_disk(
* sb_meta_uuid is only on disk if it differs from sb_uuid and the
* feature flag is set; if not set we keep it only in memory.
*/
- if (xfs_sb_version_hasmetauuid(to))
+ if (xfs_sb_is_v5(to) &&
+ (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID))
uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
else
uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
@@ -525,7 +624,7 @@ __xfs_sb_from_disk(
void
xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ struct xfs_dsb *from)
{
__xfs_sb_from_disk(to, from, true);
}
@@ -538,7 +637,12 @@ xfs_sb_quota_to_disk(
uint16_t qflags = from->sb_qflags;
to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
- if (xfs_sb_version_has_pquotino(from)) {
+
+ /*
+ * The in-memory superblock quota state matches the v5 on-disk format so
+ * just write them out and return
+ */
+ if (xfs_sb_is_v5(from)) {
to->sb_qflags = cpu_to_be16(from->sb_qflags);
to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
@@ -546,9 +650,9 @@ xfs_sb_quota_to_disk(
}
/*
- * The in-core version of sb_qflags do not have XFS_OQUOTA_*
- * flags, whereas the on-disk version does. So, convert incore
- * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
+ * For older superblocks (v4), the in-core version of sb_qflags do not
+ * have XFS_OQUOTA_* flags, whereas the on-disk version does. So,
+ * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
*/
qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
@@ -568,7 +672,7 @@ xfs_sb_quota_to_disk(
* disk. If neither are active, we should NULL the inode.
*
* In all cases, the separate pquotino must remain 0 because it
- * it beyond the "end" of the valid non-pquotino superblock.
+ * is beyond the "end" of the valid non-pquotino superblock.
*/
if (from->sb_qflags & XFS_GQUOTA_ACCT)
to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
@@ -648,19 +752,20 @@ xfs_sb_to_disk(
to->sb_features2 = cpu_to_be32(from->sb_features2);
to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
- if (xfs_sb_version_hascrc(from)) {
- to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
- to->sb_features_ro_compat =
- cpu_to_be32(from->sb_features_ro_compat);
- to->sb_features_incompat =
- cpu_to_be32(from->sb_features_incompat);
- to->sb_features_log_incompat =
- cpu_to_be32(from->sb_features_log_incompat);
- to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
- to->sb_lsn = cpu_to_be64(from->sb_lsn);
- if (xfs_sb_version_hasmetauuid(from))
- uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
- }
+ if (!xfs_sb_is_v5(from))
+ return;
+
+ to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+ to->sb_features_ro_compat =
+ cpu_to_be32(from->sb_features_ro_compat);
+ to->sb_features_incompat =
+ cpu_to_be32(from->sb_features_incompat);
+ to->sb_features_log_incompat =
+ cpu_to_be32(from->sb_features_log_incompat);
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
+ to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
}
/*
@@ -681,7 +786,7 @@ xfs_sb_read_verify(
{
struct xfs_sb sb;
struct xfs_mount *mp = bp->b_mount;
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
int error;
/*
@@ -695,8 +800,8 @@ xfs_sb_read_verify(
if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
/* Only fail bad secondaries on a known V5 filesystem */
- if (bp->b_bn == XFS_SB_DADDR ||
- xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_buf_daddr(bp) == XFS_SB_DADDR ||
+ xfs_has_crc(mp)) {
error = -EFSBADCRC;
goto out_error;
}
@@ -707,7 +812,7 @@ xfs_sb_read_verify(
* Check all the superblock fields. Don't byteswap the xquota flags
* because _verify_common checks the on-disk values.
*/
- __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+ __xfs_sb_from_disk(&sb, dsb, false);
error = xfs_validate_sb_common(mp, bp, &sb);
if (error)
goto out_error;
@@ -730,7 +835,7 @@ static void
xfs_sb_quiet_read_verify(
struct xfs_buf *bp)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
/* XFS filesystem, verify noisily! */
@@ -748,13 +853,14 @@ xfs_sb_write_verify(
struct xfs_sb sb;
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dsb *dsb = bp->b_addr;
int error;
/*
* Check all the superblock fields. Don't byteswap the xquota flags
* because _verify_common checks the on-disk values.
*/
- __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+ __xfs_sb_from_disk(&sb, dsb, false);
error = xfs_validate_sb_common(mp, bp, &sb);
if (error)
goto out_error;
@@ -762,11 +868,11 @@ xfs_sb_write_verify(
if (error)
goto out_error;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_sb_is_v5(&sb))
return;
if (bip)
- XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
return;
@@ -839,78 +945,6 @@ xfs_sb_mount_common(
}
/*
- * xfs_initialize_perag_data
- *
- * Read in each per-ag structure so we can count up the number of
- * allocated inodes, free inodes and used filesystem blocks as this
- * information is no longer persistent in the superblock. Once we have
- * this information, write it into the in-core superblock structure.
- */
-int
-xfs_initialize_perag_data(
- struct xfs_mount *mp,
- xfs_agnumber_t agcount)
-{
- xfs_agnumber_t index;
- xfs_perag_t *pag;
- xfs_sb_t *sbp = &mp->m_sb;
- uint64_t ifree = 0;
- uint64_t ialloc = 0;
- uint64_t bfree = 0;
- uint64_t bfreelst = 0;
- uint64_t btree = 0;
- uint64_t fdblocks;
- int error = 0;
-
- for (index = 0; index < agcount; index++) {
- /*
- * read the agf, then the agi. This gets us
- * all the information we need and populates the
- * per-ag structures for us.
- */
- error = xfs_alloc_pagf_init(mp, NULL, index, 0);
- if (error)
- return error;
-
- error = xfs_ialloc_pagi_init(mp, NULL, index);
- if (error)
- return error;
- pag = xfs_perag_get(mp, index);
- ifree += pag->pagi_freecount;
- ialloc += pag->pagi_count;
- bfree += pag->pagf_freeblks;
- bfreelst += pag->pagf_flcount;
- btree += pag->pagf_btreeblks;
- xfs_perag_put(pag);
- }
- fdblocks = bfree + bfreelst + btree;
-
- /*
- * If the new summary counts are obviously incorrect, fail the
- * mount operation because that implies the AGFs are also corrupt.
- * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
- * will prevent xfs_repair from fixing anything.
- */
- if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
- xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
- error = -EFSCORRUPTED;
- goto out;
- }
-
- /* Overwrite incore superblock counters with just-read data */
- spin_lock(&mp->m_sb_lock);
- sbp->sb_ifree = ifree;
- sbp->sb_icount = ialloc;
- sbp->sb_fdblocks = fdblocks;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_reinit_percpu_counters(mp);
-out:
- xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
- return error;
-}
-
-/*
* xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
* into the superblock buffer to be logged. It does not provide the higher
* level of locking that is needed to protect the in-core superblock from
@@ -921,13 +955,28 @@ xfs_log_sb(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *bp = xfs_trans_getsb(tp, mp);
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ /*
+ * Lazy sb counters don't update the in-core superblock so do that now.
+ * If this is at unmount, the counters will be exactly correct, but at
+ * any other time they will only be ballpark correct because of
+ * reservations that have been taken out percpu counters. If we have an
+ * unclean shutdown, this will be corrected by log recovery rebuilding
+ * the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+ * and hence we don't need have to update it here.
+ */
+ if (xfs_has_lazysbcount(mp)) {
+ mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ }
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
}
@@ -976,17 +1025,18 @@ int
xfs_update_secondary_sbs(
struct xfs_mount *mp)
{
- xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = 1;
int saved_error = 0;
int error = 0;
LIST_HEAD (buffer_list);
/* update secondary superblocks. */
- for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
struct xfs_buf *bp;
error = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), &bp);
/*
* If we get an error reading or writing alternate superblocks,
@@ -998,7 +1048,7 @@ xfs_update_secondary_sbs(
if (error) {
xfs_warn(mp,
"error allocating secondary superblock for ag %d",
- agno);
+ pag->pag_agno);
if (!saved_error)
saved_error = error;
continue;
@@ -1007,7 +1057,7 @@ xfs_update_secondary_sbs(
bp->b_ops = &xfs_sb_buf_ops;
xfs_buf_oneshot(bp);
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_buf_delwri_queue(bp, &buffer_list);
xfs_buf_relse(bp);
@@ -1019,7 +1069,7 @@ xfs_update_secondary_sbs(
if (error) {
xfs_warn(mp,
"write error %d updating a secondary superblock near ag %d",
- error, agno);
+ error, pag->pag_agno);
if (!saved_error)
saved_error = error;
continue;
@@ -1051,7 +1101,7 @@ xfs_sync_sb_buf(
if (error)
return error;
- bp = xfs_trans_getsb(tp, mp);
+ bp = xfs_trans_getsb(tp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
xfs_trans_set_sync(tp);
@@ -1069,10 +1119,12 @@ out:
void
xfs_fs_geometry(
- struct xfs_sb *sbp,
+ struct xfs_mount *mp,
struct xfs_fsop_geom *geo,
int struct_version)
{
+ struct xfs_sb *sbp = &mp->m_sb;
+
memset(geo, 0, sizeof(struct xfs_fsop_geom));
geo->blocksize = sbp->sb_blocksize;
@@ -1103,47 +1155,53 @@ xfs_fs_geometry(
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
XFS_FSOP_GEOM_FLAGS_DIRV2 |
XFS_FSOP_GEOM_FLAGS_EXTFLG;
- if (xfs_sb_version_hasattr(sbp))
+ if (xfs_has_attr(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
- if (xfs_sb_version_hasquota(sbp))
+ if (xfs_has_quota(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA;
- if (xfs_sb_version_hasalign(sbp))
+ if (xfs_has_align(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
- if (xfs_sb_version_hasdalign(sbp))
+ if (xfs_has_dalign(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
- if (xfs_sb_version_hassector(sbp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
- if (xfs_sb_version_hasasciici(sbp))
+ if (xfs_has_asciici(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
- if (xfs_sb_version_haslazysbcount(sbp))
+ if (xfs_has_lazysbcount(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
- if (xfs_sb_version_hasattr2(sbp))
+ if (xfs_has_attr2(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
- if (xfs_sb_version_hasprojid32bit(sbp))
+ if (xfs_has_projid32(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
- if (xfs_sb_version_hascrc(sbp))
+ if (xfs_has_crc(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB;
- if (xfs_sb_version_hasftype(sbp))
+ if (xfs_has_ftype(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE;
- if (xfs_sb_version_hasfinobt(sbp))
+ if (xfs_has_finobt(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT;
- if (xfs_sb_version_hassparseinodes(sbp))
+ if (xfs_has_sparseinodes(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES;
- if (xfs_sb_version_hasrmapbt(sbp))
+ if (xfs_has_rmapbt(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
- if (xfs_sb_version_hasreflink(sbp))
+ if (xfs_has_reflink(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
- if (xfs_sb_version_hassector(sbp))
+ if (xfs_has_bigtime(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
+ if (xfs_has_inobtcounts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
+ if (xfs_has_sector(mp)) {
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
geo->logsectsize = sbp->sb_logsectsize;
- else
+ } else {
geo->logsectsize = BBSIZE;
+ }
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
if (struct_version < 4)
return;
- if (xfs_sb_version_haslogv2(sbp))
+ if (xfs_has_logv2(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
geo->logsunit = sbp->sb_logsunit;
@@ -1198,3 +1256,61 @@ xfs_sb_get_secondary(
*bpp = bp;
return 0;
}
+
+/*
+ * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
+ * so users won't be confused by values in error messages.
+ */
+bool
+xfs_validate_stripe_geometry(
+ struct xfs_mount *mp,
+ __s64 sunit,
+ __s64 swidth,
+ int sectorsize,
+ bool silent)
+{
+ if (swidth > INT_MAX) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe width (%lld) is too large", swidth);
+ return false;
+ }
+
+ if (sunit > swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
+ return false;
+ }
+
+ if (sectorsize && (int)sunit % sectorsize) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe unit (%lld) must be a multiple of the sector size (%d)",
+ sunit, sectorsize);
+ return false;
+ }
+
+ if (sunit && !swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"invalid stripe unit (%lld) and stripe width of 0", sunit);
+ return false;
+ }
+
+ if (!sunit && swidth) {
+ if (!silent)
+ xfs_notice(mp,
+"invalid stripe width (%lld) and stripe unit of 0", swidth);
+ return false;
+ }
+
+ if (sunit && (int)swidth % (int)sunit) {
+ if (!silent)
+ xfs_notice(mp,
+"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
+ swidth, sunit);
+ return false;
+ }
+ return true;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 92465a9a5162..a5e14740ec9a 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -13,15 +13,6 @@ struct xfs_trans;
struct xfs_fsop_geom;
struct xfs_perag;
-/*
- * perag get/put wrappers for ref counting
- */
-extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
-extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
- int tag);
-extern void xfs_perag_put(struct xfs_perag *pag);
-extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
-
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
extern int xfs_sync_sb_buf(struct xfs_mount *mp);
@@ -29,11 +20,13 @@ extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+extern bool xfs_sb_good_version(struct xfs_sb *sbp);
+extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
-extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
+extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
@@ -42,4 +35,7 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **bpp);
+extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
+ __s64 sunit, __s64 swidth, int sectorsize, bool silent);
+
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index c45acbd3add9..c4381388c0c1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -22,30 +22,26 @@ struct xfs_inode;
* Buffer verifier operations are widely used, including userspace tools
*/
extern const struct xfs_buf_ops xfs_agf_buf_ops;
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
-extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
-extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
-extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_finobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbuf_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-extern const struct xfs_buf_ops xfs_rtbuf_ops;
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
@@ -58,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
/*
* Values for t_flags.
*/
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to it's reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
@@ -175,6 +181,13 @@ struct xfs_ino_geometry {
unsigned int ialloc_align;
unsigned int agino_log; /* #bits for agino in inum */
+
+ /* precomputed default inode attribute fork offset */
+ unsigned int attr_fork_offset;
+
+ /* precomputed value for di_flags2 */
+ uint64_t new_diflags2;
+
};
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 3b8260ca7d1b..bdc777b9ec4a 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -42,7 +42,7 @@ xfs_symlink_hdr_set(
{
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return 0;
memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
@@ -51,7 +51,7 @@ xfs_symlink_hdr_set(
dsl->sl_bytes = cpu_to_be32(size);
uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
dsl->sl_owner = cpu_to_be64(ino);
- dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+ dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp));
bp->b_ops = &xfs_symlink_buf_ops;
return sizeof(struct xfs_dsymlink_hdr);
@@ -89,13 +89,13 @@ xfs_symlink_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return __this_address;
if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+ if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno))
return __this_address;
if (be32_to_cpu(dsl->sl_offset) +
be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN)
@@ -116,7 +116,7 @@ xfs_symlink_read_verify(
xfs_failaddr_t fa;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
@@ -137,7 +137,7 @@ xfs_symlink_write_verify(
xfs_failaddr_t fa;
/* no verification of non-crc buffers */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
return;
fa = xfs_symlink_verify(bp);
@@ -173,7 +173,7 @@ xfs_symlink_local_to_remote(
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
- if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_has_crc(mp)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
@@ -204,20 +204,16 @@ xfs_failaddr_t
xfs_symlink_shortform_verify(
struct xfs_inode *ip)
{
- char *sfp;
- char *endp;
- struct xfs_ifork *ifp;
- int size;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ char *sfp = (char *)ifp->if_u1.if_data;
+ int size = ifp->if_bytes;
+ char *endp = sfp + size;
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- sfp = (char *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
- endp = sfp + size;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
/*
* Zero length symlinks should never occur in memory as they are
- * never alllowed to exist on disk.
+ * never allowed to exist on disk.
*/
if (!size)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 2b8ccb5b975d..8b5547073379 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -27,7 +29,7 @@ xfs_trans_ijoin(
struct xfs_inode *ip,
uint lock_flags)
{
- xfs_inode_log_item_t *iip;
+ struct xfs_inode_log_item *iip;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (ip->i_itemp == NULL)
@@ -36,6 +38,7 @@ xfs_trans_ijoin(
ASSERT(iip->ili_lock_flags == 0);
iip->ili_lock_flags = lock_flags;
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
/*
* Get a log_item_desc to point at the new item.
@@ -67,28 +70,39 @@ xfs_trans_ichgtime(
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
if (flags & XFS_ICHGTIME_CREATE)
- ip->i_d.di_crtime = tv;
+ ip->i_crtime = tv;
}
/*
- * This is called to mark the fields indicated in fieldmask as needing
- * to be logged when the transaction is committed. The inode must
- * already be associated with the given transaction.
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed. The inode must already be
+ * associated with the given transaction.
*
- * The values for fieldmask are defined in xfs_inode_item.h. We always
- * log all of the core inode if any of it has changed, and we always log
- * all of the inline data/extents/b-tree root if any of them has changed.
+ * The values for fieldmask are defined in xfs_inode_item.h. We always log all
+ * of the core inode if any of it has changed, and we always log all of the
+ * inline data/extents/b-tree root if any of them has changed.
+ *
+ * Grab and pin the cluster buffer associated with this inode to avoid RMW
+ * cycles at inode writeback time. Avoid the need to add error handling to every
+ * xfs_trans_log_inode() call by shutting down on read error. This will cause
+ * transactions to fail and everything to error out, just like if we return a
+ * read error in a dirty transaction and cancel it.
*/
void
xfs_trans_log_inode(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint flags)
{
- struct inode *inode = VFS_I(ip);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct inode *inode = VFS_I(ip);
+ uint iversion_flags = 0;
- ASSERT(ip->i_itemp != NULL);
+ ASSERT(iip);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
/*
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
@@ -96,22 +110,13 @@ xfs_trans_log_inode(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) {
+ if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
}
/*
- * Record the specific change for fdatasync optimisation. This
- * allows fdatasync to skip log forces for inodes that are only
- * timestamp dirty. We do this before the change count so that
- * the core being logged in this case does not impact on fdatasync
- * behaviour.
- */
- ip->i_itemp->ili_fsync_fields |= flags;
-
- /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. While we have the
* inode locked exclusively for metadata modification, we can usually
@@ -120,23 +125,89 @@ xfs_trans_log_inode(
* set however, then go ahead and bump the i_version counter
* unconditionally.
*/
- if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
- IS_I_VERSION(VFS_I(ip))) {
- if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
- flags |= XFS_ILOG_CORE;
+ if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
+ if (IS_I_VERSION(inode) &&
+ inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
+ iversion_flags = XFS_ILOG_CORE;
}
- tp->t_flags |= XFS_TRANS_DIRTY;
+ /*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_has_bigtime(ip->i_mount) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
+ * Record the specific change for fdatasync optimisation. This allows
+ * fdatasync to skip log forces for inodes that are only timestamp
+ * dirty.
+ */
+ spin_lock(&iip->ili_lock);
+ iip->ili_fsync_fields |= flags;
+
+ if (!iip->ili_item.li_buf) {
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * We hold the ILOCK here, so this inode is not going to be
+ * flushed while we are here. Further, because there is no
+ * buffer attached to the item, we know that there is no IO in
+ * progress, so nothing will clear the ili_fields while we read
+ * in the buffer. Hence we can safely drop the spin lock and
+ * read the buffer knowing that the state will not change from
+ * here.
+ */
+ spin_unlock(&iip->ili_lock);
+ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
+ if (error) {
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ /*
+ * We need an explicit buffer reference for the log item but
+ * don't want the buffer to remain attached to the transaction.
+ * Hold the buffer but release the transaction reference once
+ * we've attached the inode log item to the buffer log item
+ * list.
+ */
+ xfs_buf_hold(bp);
+ spin_lock(&iip->ili_lock);
+ iip->ili_item.li_buf = bp;
+ bp->b_flags |= _XBF_INODES;
+ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+ xfs_trans_brelse(tp, bp);
+ }
/*
- * Always OR in the bits from the ili_last_fields field.
- * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
- * routines in the eventual clearing of the ili_fields bits.
- * See the big comment in xfs_iflush() for an explanation of
- * this coordination mechanism.
+ * Always OR in the bits from the ili_last_fields field. This is to
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
*/
- flags |= ip->i_itemp->ili_last_fields;
- ip->i_itemp->ili_fields |= flags;
+ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
+ spin_unlock(&iip->ili_lock);
}
int
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 7a9c04920505..5b2f27cbdb80 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -56,30 +56,40 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating an extent. In classic XFS there were two trees that will be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). With reflink, there are four trees (refcountbt). The number of
- * blocks reserved is based on the formula:
+ * (rmapbt). The number of blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
uint
-xfs_allocfree_log_count(
+xfs_allocfree_block_count(
struct xfs_mount *mp,
uint num_ops)
{
uint blocks;
- blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
+ if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
- blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
}
/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
@@ -136,7 +146,7 @@ xfs_calc_inobt_res(
{
return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -155,7 +165,7 @@ STATIC uint
xfs_calc_finobt_res(
struct xfs_mount *mp)
{
- if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (!xfs_has_finobt(mp))
return 0;
return xfs_calc_inobt_res(mp);
@@ -183,11 +193,11 @@ xfs_calc_inode_chunk_res(
{
uint res, size = 0;
- res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_v3inodes(mp))
return res;
size = XFS_FSB_TO_B(mp, 1);
}
@@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res(
/*
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
- * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
- * as well as the realtime summary block.
+ * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
*/
static unsigned int
-xfs_rtalloc_log_count(
+xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int rtbmp_bytes;
- rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}
@@ -233,6 +243,28 @@ xfs_rtalloc_log_count(
* register overflow from temporaries in the calculations.
*/
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
/*
* In a write transaction we can allocate a maximum of 2
@@ -247,7 +279,7 @@ xfs_rtalloc_log_count(
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
- * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 1 block
* the allocation btrees: 2 trees * (2 * max depth - 1) * block size
* And the bmap_finish transaction can free bmap blocks in a join (t3):
@@ -255,34 +287,65 @@ xfs_rtalloc_log_count(
* the agfls of the ags containing the blocks: 2 * sector size
* the super block free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_write_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+ if (xfs_has_realtime(mp)) {
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
} else {
t2 = 0;
}
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcountbtree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
}
/*
@@ -299,38 +362,67 @@ xfs_calc_write_reservation(
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
- * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_itruncate_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
- if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+ if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
} else {
t3 = 0;
}
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
}
/*
* In renaming a files we can modify:
- * the four inodes involved: 4 * inode size
+ * the five inodes involved: 5 * inode size
* the two directory btrees: 2 * (max depth + v2) * dir block size
* the two directory bmap btrees: 2 * max depth * block size
* And the bmap_finish transaction can free dir and bmap blocks (two sets
@@ -345,11 +437,11 @@ xfs_calc_rename_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- max((xfs_calc_inode_res(mp, 4) +
+ max((xfs_calc_inode_res(mp, 5) +
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -389,7 +481,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -423,11 +515,11 @@ xfs_calc_remove_reservation(
{
return XFS_DQUOT_LOGRES(mp) +
xfs_calc_iunlink_add_reservation(mp) +
- max((xfs_calc_inode_res(mp, 1) +
+ max((xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -572,7 +664,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -670,7 +762,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -791,34 +883,19 @@ xfs_calc_qm_setqlim_reservation(void)
*/
STATIC uint
xfs_calc_qm_dqalloc_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- return xfs_calc_write_reservation(mp) +
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
xfs_calc_buf_res(1,
XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
}
-/*
- * Turning off quotas.
- * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
- * the superblock for the quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_quotaoff_reservation(
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
struct xfs_mount *mp)
{
- return sizeof(struct xfs_qoff_logitem) * 2 +
- xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * End of turning off quotas.
- * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
- */
-STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(void)
-{
- return sizeof(struct xfs_qoff_logitem) * 2;
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
}
/*
@@ -837,23 +914,18 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
+ int logcount_adj = 0;
+
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
*/
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
- resp->tr_itruncate.tr_logcount =
- XFS_ITRUNCATE_LOG_COUNT_REFLINK;
- else
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -909,11 +981,9 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- if (xfs_sb_version_hasreflink(&mp->m_sb))
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
@@ -923,13 +993,6 @@ xfs_trans_resv_calc(
resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation();
resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
- resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
- resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
- resp->tr_qm_equotaoff.tr_logres =
- xfs_calc_qm_quotaoff_end_reservation();
- resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
@@ -946,4 +1009,20 @@ xfs_trans_resv_calc(
resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 7241ab28cf84..0554b9d775d2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -46,8 +46,6 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
- struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
- struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
};
@@ -75,7 +73,6 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
-#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -85,13 +82,24 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
-#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 88221c7a04cc..87b31c69a773 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -17,6 +17,13 @@
/* Adding one rmap could split every level up to the top of the tree. */
#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9
+
/* Blocks we might need to add "b" rmaps to a tree. */
#define XFS_NRMAPADD_SPACE_RES(mp, b)\
(((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
@@ -57,8 +64,7 @@
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
(M_IGEO(mp)->ialloc_blks + \
- (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
- (M_IGEO(mp)->inobt_maxlevels - 1)))
+ ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels))
/*
* Space reservation values for various transactions.
@@ -75,7 +81,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * (mp)->m_ag_maxlevels)
+ (2 * (mp)->m_alloc_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
@@ -94,8 +100,7 @@
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
#define XFS_IFREE_SPACE_RES(mp) \
- (xfs_sb_version_hasfinobt(&mp->m_sb) ? \
- M_IGEO(mp)->inobt_maxlevels : 0)
+ (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0)
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 4f595546a639..5c2765934732 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -11,26 +11,15 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
-/* Find the size of the AG, in blocks. */
-xfs_agblock_t
-xfs_ag_block_count(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- ASSERT(agno < mp->m_sb.sb_agcount);
-
- if (agno < mp->m_sb.sb_agcount - 1)
- return mp->m_sb.sb_agblocks;
- return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
-}
/*
* Verify that an AG block number pointer neither points outside the AG
* nor points at static metadata.
*/
-bool
-xfs_verify_agbno(
+static inline bool
+xfs_verify_agno_agbno(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agblock_t agbno)
@@ -49,7 +38,7 @@ xfs_verify_agbno(
* Verify that an FS block number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
-bool
+inline bool
xfs_verify_fsbno(
struct xfs_mount *mp,
xfs_fsblock_t fsbno)
@@ -58,43 +47,38 @@ xfs_verify_fsbno(
if (agno >= mp->m_sb.sb_agcount)
return false;
- return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+ return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
}
-/* Calculate the first and last possible inode number in an AG. */
-void
-xfs_agino_range(
+/*
+ * Verify that a data device extent is fully contained inside the filesystem,
+ * does not cross an AG boundary, and does not point at static metadata.
+ */
+bool
+xfs_verify_fsbext(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t *first,
- xfs_agino_t *last)
+ xfs_fsblock_t fsbno,
+ xfs_fsblock_t len)
{
- xfs_agblock_t bno;
- xfs_agblock_t eoag;
+ if (fsbno + len <= fsbno)
+ return false;
- eoag = xfs_ag_block_count(mp, agno);
+ if (!xfs_verify_fsbno(mp, fsbno))
+ return false;
- /*
- * Calculate the first inode, which will be in the first
- * cluster-aligned block after the AGFL.
- */
- bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
- *first = XFS_AGB_TO_AGINO(mp, bno);
-
- /*
- * Calculate the last inode, which will be at the end of the
- * last (aligned) cluster that can be allocated in the AG.
- */
- bno = round_down(eoag, M_IGEO(mp)->cluster_align);
- *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+ if (!xfs_verify_fsbno(mp, fsbno + len - 1))
+ return false;
+
+ return XFS_FSB_TO_AGNO(mp, fsbno) ==
+ XFS_FSB_TO_AGNO(mp, fsbno + len - 1);
}
/*
* Verify that an AG inode number pointer neither points outside the AG
* nor points at static metadata.
*/
-bool
-xfs_verify_agino(
+static inline bool
+xfs_verify_agno_agino(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agino_t agino)
@@ -107,23 +91,10 @@ xfs_verify_agino(
}
/*
- * Verify that an AG inode number pointer neither points outside the AG
- * nor points at static metadata, or is NULLAGINO.
- */
-bool
-xfs_verify_agino_or_null(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t agino)
-{
- return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
-}
-
-/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
-bool
+inline bool
xfs_verify_ino(
struct xfs_mount *mp,
xfs_ino_t ino)
@@ -135,17 +106,17 @@ xfs_verify_ino(
return false;
if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
return false;
- return xfs_verify_agino(mp, agno, agino);
+ return xfs_verify_agno_agino(mp, agno, agino);
}
/* Is this an internal inode number? */
-bool
+inline bool
xfs_internal_inum(
struct xfs_mount *mp,
xfs_ino_t ino)
{
return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
- (xfs_sb_version_hasquota(&mp->m_sb) &&
+ (xfs_has_quota(mp) &&
xfs_is_quota_inode(&mp->m_sb, ino));
}
@@ -167,7 +138,7 @@ xfs_verify_dir_ino(
* Verify that an realtime block number pointer doesn't point off the
* end of the realtime device.
*/
-bool
+inline bool
xfs_verify_rtbno(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
@@ -175,25 +146,38 @@ xfs_verify_rtbno(
return rtbno < mp->m_sb.sb_rblocks;
}
+/* Verify that a realtime device extent is fully contained inside the volume. */
+bool
+xfs_verify_rtext(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_rtblock_t len)
+{
+ if (rtbno + len <= rtbno)
+ return false;
+
+ if (!xfs_verify_rtbno(mp, rtbno))
+ return false;
+
+ return xfs_verify_rtbno(mp, rtbno + len - 1);
+}
+
/* Calculate the range of valid icount values. */
-void
+inline void
xfs_icount_range(
struct xfs_mount *mp,
unsigned long long *min,
unsigned long long *max)
{
unsigned long long nr_inos = 0;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
/* root, rtbitmap, rtsum all live in the first chunk */
*min = XFS_INODES_PER_CHUNK;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- xfs_agino_t first, last;
-
- xfs_agino_range(mp, agno, &first, &last);
- nr_inos += last - first + 1;
- }
+ for_each_perag(mp, agno, pag)
+ nr_inos += pag->agino_max - pag->agino_min + 1;
*max = nr_inos;
}
@@ -219,3 +203,28 @@ xfs_verify_dablk(
return dabno <= max_dablk;
}
+
+/* Check that a file block offset does not exceed the maximum. */
+bool
+xfs_verify_fileoff(
+ struct xfs_mount *mp,
+ xfs_fileoff_t off)
+{
+ return off <= XFS_MAX_FILEOFF;
+}
+
+/* Check that a range of file block offsets do not exceed the maximum. */
+bool
+xfs_verify_fileext(
+ struct xfs_mount *mp,
+ xfs_fileoff_t off,
+ xfs_fileoff_t len)
+{
+ if (off + len <= off)
+ return false;
+
+ if (!xfs_verify_fileoff(mp, off))
+ return false;
+
+ return xfs_verify_fileoff(mp, off + len - 1);
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 397d94775440..5ebdda7e1078 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
-typedef int32_t xfs_extnum_t; /* # of extents in a file */
-typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
@@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
+typedef int64_t xfs_csn_t; /* CIL sequence number */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
@@ -56,13 +57,6 @@ typedef void * xfs_failaddr_t;
#define NULLAGINO ((xfs_agino_t)-1)
/*
- * Max values for extlen, extnum, aextnum.
- */
-#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
-#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
-#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
-
-/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
@@ -86,6 +80,11 @@ typedef void * xfs_failaddr_t;
#define XFS_ATTR_FORK 1
#define XFS_COW_FORK 2
+#define XFS_WHICHFORK_STRINGS \
+ { XFS_DATA_FORK, "data" }, \
+ { XFS_ATTR_FORK, "attr" }, \
+ { XFS_COW_FORK, "cow" }
+
/*
* Min numbers of data/attr fork btree root pointers.
*/
@@ -167,6 +166,36 @@ typedef struct xfs_bmbt_irec
xfs_exntst_t br_state; /* extent state */
} xfs_bmbt_irec_t;
+enum xfs_refc_domain {
+ XFS_REFC_DOMAIN_SHARED = 0,
+ XFS_REFC_DOMAIN_COW,
+};
+
+#define XFS_REFC_DOMAIN_STRINGS \
+ { XFS_REFC_DOMAIN_SHARED, "shared" }, \
+ { XFS_REFC_DOMAIN_COW, "cow" }
+
+struct xfs_refcount_irec {
+ xfs_agblock_t rc_startblock; /* starting block number */
+ xfs_extlen_t rc_blockcount; /* count of free blocks */
+ xfs_nlink_t rc_refcount; /* number of inodes linked here */
+ enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */
+};
+
+#define XFS_RMAP_ATTR_FORK (1 << 0)
+#define XFS_RMAP_BMBT_BLOCK (1 << 1)
+#define XFS_RMAP_UNWRITTEN (1 << 2)
+#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
+ XFS_RMAP_BMBT_BLOCK)
+#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
+struct xfs_rmap_irec {
+ xfs_agblock_t rm_startblock; /* extent start block */
+ xfs_extlen_t rm_blockcount; /* extent length */
+ uint64_t rm_owner; /* extent owner */
+ uint64_t rm_offset; /* offset within the owner */
+ unsigned int rm_flags; /* state flags */
+};
+
/* per-AG block reservation types */
enum xfs_ag_resv_type {
XFS_AG_RESV_NONE = 0,
@@ -180,24 +209,22 @@ enum xfs_ag_resv_type {
*/
struct xfs_mount;
-xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
-bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno);
bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
+ xfs_fsblock_t len);
-void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t *first, xfs_agino_t *last);
-bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t agino);
-bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
+bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_rtblock_t len);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
unsigned long long *max);
+bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
+bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
+ xfs_fileoff_t len);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index ba0f747c82e8..b7b838bd4ba4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -14,6 +14,7 @@
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -35,7 +36,7 @@ xchk_superblock_xref(
agbno = XFS_SB_BLOCK(mp);
- error = xchk_ag_init(sc, agno, &sc->sa);
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
if (!xchk_xref_process_error(sc, agno, agbno, &error))
return;
@@ -62,6 +63,7 @@ xchk_superblock(
struct xfs_mount *mp = sc->mp;
struct xfs_buf *bp;
struct xfs_dsb *sb;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
uint32_t v2_ok;
__be32 features_mask;
@@ -72,6 +74,15 @@ xchk_superblock(
if (agno == 0)
return 0;
+ /*
+ * Grab an active reference to the perag structure. If we can't get
+ * it, we're racing with something that's tearing down the AG, so
+ * signal that the AG no longer exists.
+ */
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return -ENOENT;
+
error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp);
/*
* The superblock verifier can return several different error codes
@@ -86,13 +97,14 @@ xchk_superblock(
case -ENOSYS:
case -EFBIG:
error = -EFSCORRUPTED;
+ fallthrough;
default:
break;
}
if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
- return error;
+ goto out_pag;
- sb = XFS_BUF_TO_SBP(bp);
+ sb = bp->b_addr;
/*
* Verify the geometries match. Fields that are permanently
@@ -246,7 +258,7 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
} else {
v2_ok = XFS_SB_VERSION2_OKBITS;
- if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5)
+ if (xfs_sb_is_v5(&mp->m_sb))
v2_ok |= XFS_SB_VERSION2_CRCBIT;
if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
@@ -269,48 +281,47 @@ xchk_superblock(
features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
if ((sb->sb_features2 & features_mask) !=
(cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
- xchk_block_set_corrupt(sc, bp);
+ xchk_block_set_preen(sc, bp);
- if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_has_crc(mp)) {
/* all v5 fields must be zero */
if (memchr_inv(&sb->sb_features_compat, 0,
sizeof(struct xfs_dsb) -
offsetof(struct xfs_dsb, sb_features_compat)))
xchk_block_set_corrupt(sc, bp);
} else {
- /* Check compat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN);
- if ((sb->sb_features_compat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask))
+ /* compat features must match */
+ if (sb->sb_features_compat !=
+ cpu_to_be32(mp->m_sb.sb_features_compat))
xchk_block_set_corrupt(sc, bp);
- /* Check ro compat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN |
- XFS_SB_FEAT_RO_COMPAT_FINOBT |
- XFS_SB_FEAT_RO_COMPAT_RMAPBT |
- XFS_SB_FEAT_RO_COMPAT_REFLINK);
- if ((sb->sb_features_ro_compat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_ro_compat) &
- features_mask))
+ /* ro compat features must match */
+ if (sb->sb_features_ro_compat !=
+ cpu_to_be32(mp->m_sb.sb_features_ro_compat))
xchk_block_set_corrupt(sc, bp);
- /* Check incompat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN |
- XFS_SB_FEAT_INCOMPAT_FTYPE |
- XFS_SB_FEAT_INCOMPAT_SPINODES |
- XFS_SB_FEAT_INCOMPAT_META_UUID);
- if ((sb->sb_features_incompat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_incompat) &
- features_mask))
- xchk_block_set_corrupt(sc, bp);
+ /*
+ * NEEDSREPAIR is ignored on a secondary super, so we should
+ * clear it when we find it, though it's not a corruption.
+ */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+ sb->sb_features_incompat) & features_mask)
+ xchk_block_set_preen(sc, bp);
- /* Check log incompat flags; all are set at mkfs time. */
- features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN);
- if ((sb->sb_features_log_incompat & features_mask) !=
- (cpu_to_be32(mp->m_sb.sb_features_log_incompat) &
- features_mask))
+ /* all other incompat features must match */
+ if ((cpu_to_be32(mp->m_sb.sb_features_incompat) ^
+ sb->sb_features_incompat) & ~features_mask)
xchk_block_set_corrupt(sc, bp);
+ /*
+ * log incompat features protect newer log record types from
+ * older log recovery code. Log recovery doesn't check the
+ * secondary supers, so we can clear these if needed.
+ */
+ if (sb->sb_features_log_incompat)
+ xchk_block_set_preen(sc, bp);
+
/* Don't care about sb_crc */
if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
@@ -322,7 +333,7 @@ xchk_superblock(
/* Don't care about sb_lsn */
}
- if (xfs_sb_version_hasmetauuid(&mp->m_sb)) {
+ if (xfs_has_metauuid(mp)) {
/* The metadata UUID must be the same for all supers */
if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid))
xchk_block_set_corrupt(sc, bp);
@@ -334,7 +345,8 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
xchk_superblock_xref(sc, bp);
-
+out_pag:
+ xfs_perag_put(pag);
return error;
}
@@ -344,7 +356,7 @@ xchk_superblock(
STATIC int
xchk_agf_record_bno_lengths(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
+ const struct xfs_alloc_rec_incore *rec,
void *priv)
{
xfs_extlen_t *blocks = priv;
@@ -358,7 +370,7 @@ static inline void
xchk_agf_xref_freeblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_extlen_t blocks = 0;
int error;
@@ -378,7 +390,7 @@ static inline void
xchk_agf_xref_cntbt(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_agblock_t agbno;
xfs_extlen_t blocks;
int have;
@@ -410,12 +422,16 @@ STATIC void
xchk_agf_xref_btreeblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t blocks;
xfs_agblock_t btreeblks;
int error;
+ /* agf_btreeblks didn't exist before lazysbcount */
+ if (!xfs_has_lazysbcount(sc->mp))
+ return;
+
/* Check agf_rmap_blocks; set up for agf_btreeblks check */
if (sc->sa.rmap_cur) {
error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks);
@@ -432,7 +448,7 @@ xchk_agf_xref_btreeblks(
* No rmap cursor; we can't xref if we have the rmapbt feature.
* We also can't do it if we're missing the free space btree cursors.
*/
- if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) ||
+ if ((xfs_has_rmapbt(mp) && !sc->sa.rmap_cur) ||
!sc->sa.bno_cur || !sc->sa.cnt_cur)
return;
@@ -456,7 +472,7 @@ static inline void
xchk_agf_xref_refcblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_agblock_t blocks;
int error;
@@ -477,16 +493,13 @@ xchk_agf_xref(
{
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
- int error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
agbno = XFS_AGF_BLOCK(mp);
- error = xchk_ag_btcur_init(sc, &sc->sa);
- if (error)
- return;
+ xchk_ag_btcur_init(sc, &sc->sa);
xchk_xref_is_used_space(sc, agbno, 1);
xchk_agf_xref_freeblks(sc);
@@ -508,7 +521,7 @@ xchk_agf(
struct xfs_mount *mp = sc->mp;
struct xfs_agf *agf;
struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
xfs_agblock_t agfl_first;
@@ -518,54 +531,53 @@ xchk_agf(
int level;
int error = 0;
- agno = sc->sa.agno = sc->sm->sm_agno;
- error = xchk_ag_read_headers(sc, agno, &sc->sa.agi_bp,
- &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ error = xchk_ag_read_headers(sc, agno, &sc->sa);
if (!xchk_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error))
goto out;
xchk_buffer_recheck(sc, sc->sa.agf_bp);
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agf = sc->sa.agf_bp->b_addr;
+ pag = sc->sa.pag;
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
- if (eoag != xfs_ag_block_count(mp, agno))
+ if (eoag != pag->block_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_alloc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_rmap_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
agbno = be32_to_cpu(agf->agf_refcount_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_refcount_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > mp->m_refc_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
}
@@ -581,14 +593,13 @@ xchk_agf(
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Do the incore counters match? */
- pag = xfs_perag_get(mp, agno);
if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+ if (xfs_has_lazysbcount(sc->mp) &&
+ pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
- xfs_perag_put(pag);
xchk_agf_xref(sc);
out:
@@ -628,9 +639,8 @@ xchk_agfl_block(
{
struct xchk_agfl_info *sai = priv;
struct xfs_scrub *sc = sai->sc;
- xfs_agnumber_t agno = sc->sa.agno;
- if (xfs_verify_agbno(mp, agno, agbno) &&
+ if (xfs_verify_agbno(sc->sa.pag, agbno) &&
sai->nr_entries < sai->sz_entries)
sai->entries[sai->nr_entries++] = agbno;
else
@@ -662,16 +672,13 @@ xchk_agfl_xref(
{
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
- int error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
agbno = XFS_AGFL_BLOCK(mp);
- error = xchk_ag_btcur_init(sc, &sc->sa);
- if (error)
- return;
+ xchk_ag_btcur_init(sc, &sc->sa);
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
@@ -691,14 +698,12 @@ xchk_agfl(
{
struct xchk_agfl_info sai;
struct xfs_agf *agf;
- xfs_agnumber_t agno;
+ xfs_agnumber_t agno = sc->sm->sm_agno;
unsigned int agflcount;
unsigned int i;
int error;
- agno = sc->sa.agno = sc->sm->sm_agno;
- error = xchk_ag_read_headers(sc, agno, &sc->sa.agi_bp,
- &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ error = xchk_ag_read_headers(sc, agno, &sc->sa);
if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
goto out;
if (!sc->sa.agf_bp)
@@ -711,7 +716,7 @@ xchk_agfl(
goto out;
/* Allocate buffer to ensure uniqueness of AGFL entries. */
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agf = sc->sa.agf_bp->b_addr;
agflcount = be32_to_cpu(agf->agf_flcount);
if (agflcount > xfs_agfl_size(sc->mp)) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
@@ -728,7 +733,7 @@ xchk_agfl(
}
/* Check the blocks in the AGFL. */
- error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
sc->sa.agfl_bp, xchk_agfl_block, &sai);
if (error == -ECANCELED) {
error = 0;
@@ -765,7 +770,7 @@ static inline void
xchk_agi_xref_icounts(
struct xfs_scrub *sc)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
xfs_agino_t icount;
xfs_agino_t freecount;
int error;
@@ -781,6 +786,35 @@ xchk_agi_xref_icounts(
xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
}
+/* Check agi_[fi]blocks against tree size */
+static inline void
+xchk_agi_xref_fiblocks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agblock_t blocks;
+ int error = 0;
+
+ if (!xfs_has_inobtcounts(sc->mp))
+ return;
+
+ if (sc->sa.ino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_iblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_fblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+}
+
/* Cross-reference with the other btrees. */
STATIC void
xchk_agi_xref(
@@ -788,22 +822,20 @@ xchk_agi_xref(
{
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
- int error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
agbno = XFS_AGI_BLOCK(mp);
- error = xchk_ag_btcur_init(sc, &sc->sa);
- if (error)
- return;
+ xchk_ag_btcur_init(sc, &sc->sa);
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
xchk_agi_xref_icounts(sc);
xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_agi_xref_fiblocks(sc);
/* scrub teardown will take care of sc->sa for us */
}
@@ -816,7 +848,8 @@ xchk_agi(
struct xfs_mount *mp = sc->mp;
struct xfs_agi *agi;
struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_ino_geometry *igeo = M_IGEO(sc->mp);
+ xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
xfs_agino_t agino;
@@ -827,36 +860,35 @@ xchk_agi(
int level;
int error = 0;
- agno = sc->sa.agno = sc->sm->sm_agno;
- error = xchk_ag_read_headers(sc, agno, &sc->sa.agi_bp,
- &sc->sa.agf_bp, &sc->sa.agfl_bp);
+ error = xchk_ag_read_headers(sc, agno, &sc->sa);
if (!xchk_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error))
goto out;
xchk_buffer_recheck(sc, sc->sa.agi_bp);
- agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ agi = sc->sa.agi_bp->b_addr;
+ pag = sc->sa.pag;
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
- if (eoag != xfs_ag_block_count(mp, agno))
+ if (eoag != pag->block_count)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check btree roots and levels */
agbno = be32_to_cpu(agi->agi_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (xfs_has_finobt(mp)) {
agbno = be32_to_cpu(agi->agi_free_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_free_level);
- if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+ if (level <= 0 || level > igeo->inobt_maxlevels)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
@@ -869,17 +901,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
@@ -887,12 +919,10 @@ xchk_agi(
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Do the incore counters match? */
- pag = xfs_perag_get(mp, agno);
if (pag->pagi_count != be32_to_cpu(agi->agi_count))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
- xfs_perag_put(pag);
xchk_agi_xref(sc);
out:
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index d5e6db9af434..1b0b4e243f77 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -20,6 +20,7 @@
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -49,7 +50,19 @@ xrep_superblock(
/* Copy AG 0's superblock to this one. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+
+ /*
+ * Don't write out a secondary super with NEEDSREPAIR or log incompat
+ * features set, since both are ignored when set on a secondary.
+ */
+ if (xfs_has_crc(mp)) {
+ struct xfs_dsb *sb = bp->b_addr;
+
+ sb->sb_features_incompat &=
+ ~cpu_to_be32(XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR);
+ sb->sb_features_log_incompat = 0;
+ }
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
@@ -69,7 +82,7 @@ struct xrep_agf_allocbt {
STATIC int
xrep_agf_walk_allocbt(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
+ const struct xfs_alloc_rec_incore *rec,
void *priv)
{
struct xrep_agf_allocbt *raa = priv;
@@ -93,7 +106,7 @@ xrep_agf_check_agfl_block(
{
struct xfs_scrub *sc = priv;
- if (!xfs_verify_agbno(mp, sc->sa.agno, agbno))
+ if (!xfs_verify_agbno(sc->sa.pag, agbno))
return -EFSCORRUPTED;
return 0;
}
@@ -117,11 +130,8 @@ xrep_check_btree_root(
struct xfs_scrub *sc,
struct xrep_find_ag_btree *fab)
{
- struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sc->sm->sm_agno;
-
- return xfs_verify_agbno(mp, agno, fab->root) &&
- fab->height <= XFS_BTREE_MAXLEVELS;
+ return xfs_verify_agbno(sc->sa.pag, fab->root) &&
+ fab->height <= fab->maxlevels;
}
/*
@@ -140,7 +150,7 @@ xrep_agf_find_btrees(
struct xrep_find_ag_btree *fab,
struct xfs_buf *agfl_bp)
{
- struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *old_agf = agf_bp->b_addr;
int error;
/* Go find the root data. */
@@ -163,7 +173,7 @@ xrep_agf_find_btrees(
return -EFSCORRUPTED;
/* We must find the refcountbt root if that feature is enabled. */
- if (xfs_sb_version_hasreflink(&sc->mp->m_sb) &&
+ if (xfs_has_reflink(sc->mp) &&
!xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT]))
return -EFSCORRUPTED;
@@ -181,18 +191,18 @@ xrep_agf_init_header(
struct xfs_agf *old_agf)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
memcpy(old_agf, agf, sizeof(*old_agf));
memset(agf, 0, BBTOB(agf_bp->b_length));
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(sc->sa.agno);
- agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agf->agf_length = cpu_to_be32(sc->sa.pag->block_count);
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
/* Mark the incore AGF data stale until we're done fixing things. */
@@ -222,7 +232,7 @@ xrep_agf_set_roots(
agf->agf_levels[XFS_BTNUM_RMAPi] =
cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
- if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ if (xfs_has_reflink(sc->mp)) {
agf->agf_refcount_root =
cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
agf->agf_refcount_level =
@@ -238,15 +248,15 @@ xrep_agf_calc_from_btrees(
{
struct xrep_agf_allocbt raa = { .sc = sc };
struct xfs_btree_cur *cur = NULL;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t btreeblks;
xfs_agblock_t blocks;
int error;
/* Update the AGF counters from the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_BNO);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
if (error)
goto err;
@@ -259,8 +269,8 @@ xrep_agf_calc_from_btrees(
agf->agf_longest = cpu_to_be32(raa.longest);
/* Update the AGF counters from the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_CNT);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -268,7 +278,7 @@ xrep_agf_calc_from_btrees(
btreeblks += blocks - 1;
/* Update the AGF counters from the rmapbt. */
- cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -279,9 +289,9 @@ xrep_agf_calc_from_btrees(
agf->agf_btreeblks = cpu_to_be32(btreeblks);
/* Update the AGF counters from the refcountbt. */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
- sc->sa.agno);
+ sc->sa.pag);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -302,7 +312,7 @@ xrep_agf_commit_new(
struct xfs_buf *agf_bp)
{
struct xfs_perag *pag;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
/* Trigger fdblocks recalculation */
xfs_force_summary_recalc(sc->mp);
@@ -337,18 +347,22 @@ xrep_agf(
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_bnobt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_cntbt_buf_ops,
+ .maxlevels = sc->mp->m_alloc_maxlevels,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
+ .maxlevels = sc->mp->m_rmap_maxlevels,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
+ .maxlevels = sc->mp->m_refc_maxlevels,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -362,21 +376,21 @@ xrep_agf(
int error;
/* We require the rmapbt to rebuild anything. */
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xchk_perag_get(sc->mp, &sc->sa);
/*
* Make sure we have the AGF buffer, as scrub might have decided it
* was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
if (error)
return error;
agf_bp->b_ops = &xfs_agf_buf_ops;
- agf = XFS_BUF_TO_AGF(agf_bp);
+ agf = agf_bp->b_addr;
/*
* Load the AGFL so that we can screen out OWN_AG blocks that are on
@@ -387,7 +401,7 @@ xrep_agf(
* btrees rooted in the AGF. If the AGFL contents are obviously bad
* then we'll bail out.
*/
- error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
if (error)
return error;
@@ -395,7 +409,7 @@ xrep_agf(
* Spot-check the AGFL blocks; if they're obviously corrupt then
* there's nothing we can do but bail out.
*/
- error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
+ error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
xrep_agf_check_agfl_block, sc);
if (error)
return error;
@@ -429,10 +443,10 @@ out_revert:
struct xrep_agfl {
/* Bitmap of other OWN_AG metadata blocks. */
- struct xfs_bitmap agmetablocks;
+ struct xbitmap agmetablocks;
/* Bitmap of free space. */
- struct xfs_bitmap *freesp;
+ struct xbitmap *freesp;
struct xfs_scrub *sc;
};
@@ -441,7 +455,7 @@ struct xrep_agfl {
STATIC int
xrep_agfl_walk_rmap(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xrep_agfl *ra = priv;
@@ -453,14 +467,14 @@ xrep_agfl_walk_rmap(
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
rec->rm_startblock);
- error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
if (error)
return error;
}
- return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur);
+ return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
}
/*
@@ -476,39 +490,37 @@ STATIC int
xrep_agfl_collect_blocks(
struct xfs_scrub *sc,
struct xfs_buf *agf_bp,
- struct xfs_bitmap *agfl_extents,
+ struct xbitmap *agfl_extents,
xfs_agblock_t *flcount)
{
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *n;
int error;
ra.sc = sc;
ra.freesp = agfl_extents;
- xfs_bitmap_init(&ra.agmetablocks);
+ xbitmap_init(&ra.agmetablocks);
/* Find all space used by the free space btrees & rmapbt. */
- cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
if (error)
goto err;
xfs_btree_del_cursor(cur, error);
/* Find all blocks currently being used by the bnobt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_BNO);
- error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
xfs_btree_del_cursor(cur, error);
/* Find all blocks currently being used by the cntbt. */
- cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
- XFS_BTNUM_CNT);
- error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur);
+ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
@@ -518,8 +530,8 @@ xrep_agfl_collect_blocks(
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
- error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks);
- xfs_bitmap_destroy(&ra.agmetablocks);
+ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
if (error)
return error;
@@ -527,18 +539,12 @@ xrep_agfl_collect_blocks(
* Calculate the new AGFL size. If we found more blocks than fit in
* the AGFL we'll free them later.
*/
- *flcount = 0;
- for_each_xfs_bitmap_extent(br, n, agfl_extents) {
- *flcount += br->len;
- if (*flcount > xfs_agfl_size(mp))
- break;
- }
- if (*flcount > xfs_agfl_size(mp))
- *flcount = xfs_agfl_size(mp);
+ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ xfs_agfl_size(mp));
return 0;
err:
- xfs_bitmap_destroy(&ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
xfs_btree_del_cursor(cur, error);
return error;
}
@@ -550,7 +556,7 @@ xrep_agfl_update_agf(
struct xfs_buf *agf_bp,
xfs_agblock_t flcount)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
ASSERT(flcount <= xfs_agfl_size(sc->mp));
@@ -573,13 +579,13 @@ STATIC void
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
- struct xfs_bitmap *agfl_extents,
+ struct xbitmap *agfl_extents,
xfs_agblock_t flcount)
{
struct xfs_mount *mp = sc->mp;
__be32 *agfl_bno;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *br;
+ struct xbitmap_range *n;
struct xfs_agfl *agfl;
xfs_agblock_t agbno;
unsigned int fl_off;
@@ -593,7 +599,7 @@ xrep_agfl_init_header(
agfl = XFS_BUF_TO_AGFL(agfl_bp);
memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
- agfl->agfl_seqno = cpu_to_be32(sc->sa.agno);
+ agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
/*
@@ -602,11 +608,12 @@ xrep_agfl_init_header(
* step.
*/
fl_off = 0;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
- for_each_xfs_bitmap_extent(br, n, agfl_extents) {
+ agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
agbno = XFS_FSB_TO_AGBNO(mp, br->start);
- trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len);
+ trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno,
+ br->len);
while (br->len > 0 && fl_off < flcount) {
agfl_bno[fl_off] = cpu_to_be32(agbno);
@@ -637,7 +644,7 @@ int
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xfs_bitmap agfl_extents;
+ struct xbitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
struct xfs_buf *agfl_bp;
@@ -645,18 +652,17 @@ xrep_agfl(
int error;
/* We require the rmapbt to rebuild anything. */
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xchk_perag_get(sc->mp, &sc->sa);
- xfs_bitmap_init(&agfl_extents);
+ xbitmap_init(&agfl_extents);
/*
* Read the AGF so that we can query the rmapbt. We hope that there's
* nothing wrong with the AGF, but all the AG header repair functions
* have this chicken-and-egg problem.
*/
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
@@ -665,7 +671,8 @@ xrep_agfl(
* was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGFL_DADDR(mp)),
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
if (error)
return error;
@@ -696,10 +703,10 @@ xrep_agfl(
goto err;
/* Dump any AGFL overflow. */
- return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
err:
- xfs_bitmap_destroy(&agfl_extents);
+ xbitmap_destroy(&agfl_extents);
return error;
}
@@ -730,7 +737,7 @@ xrep_agi_find_btrees(
int error;
/* Read the AGF. */
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
@@ -744,7 +751,7 @@ xrep_agi_find_btrees(
return -EFSCORRUPTED;
/* We must find the finobt root if that feature is enabled. */
- if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ if (xfs_has_finobt(mp) &&
!xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT]))
return -EFSCORRUPTED;
@@ -761,18 +768,18 @@ xrep_agi_init_header(
struct xfs_buf *agi_bp,
struct xfs_agi *old_agi)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
memcpy(old_agi, agi, sizeof(*old_agi));
memset(agi, 0, BBTOB(agi_bp->b_length));
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(sc->sa.agno);
- agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+ agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agi->agi_length = cpu_to_be32(sc->sa.pag->block_count);
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_has_crc(mp))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
/* We don't know how to fix the unlinked list yet. */
@@ -794,7 +801,7 @@ xrep_agi_set_roots(
agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root);
agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height);
- if (xfs_sb_version_hasfinobt(&sc->mp->m_sb)) {
+ if (xfs_has_finobt(sc->mp)) {
agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root);
agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height);
}
@@ -807,21 +814,42 @@ xrep_agi_calc_from_btrees(
struct xfs_buf *agi_bp)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agino_t count;
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
- XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
+ sc->sa.pag, XFS_BTNUM_INO);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
+ if (xfs_has_inobtcounts(mp)) {
+ xfs_agblock_t blocks;
+
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ agi->agi_iblocks = cpu_to_be32(blocks);
+ }
xfs_btree_del_cursor(cur, error);
agi->agi_count = cpu_to_be32(count);
agi->agi_freecount = cpu_to_be32(freecount);
+
+ if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
+ xfs_agblock_t blocks;
+
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
+ sc->sa.pag, XFS_BTNUM_FINO);
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agi->agi_fblocks = cpu_to_be32(blocks);
+ }
+
return 0;
err:
xfs_btree_del_cursor(cur, error);
@@ -835,7 +863,7 @@ xrep_agi_commit_new(
struct xfs_buf *agi_bp)
{
struct xfs_perag *pag;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
/* Trigger inode count recalculation */
xfs_force_summary_recalc(sc->mp);
@@ -862,10 +890,12 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_finobt_buf_ops,
+ .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels,
},
[XREP_AGI_END] = {
.buf_ops = NULL
@@ -878,21 +908,21 @@ xrep_agi(
int error;
/* We require the rmapbt to rebuild anything. */
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xchk_perag_get(sc->mp, &sc->sa);
/*
* Make sure we have the AGI buffer, as scrub might have decided it
* was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGI_DADDR(mp)),
+ XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL);
if (error)
return error;
agi_bp->b_ops = &xfs_agi_buf_ops;
- agi = XFS_BUF_TO_AGI(agi_bp);
+ agi = agi_bp->b_addr;
/* Find the AGI btree roots. */
error = xrep_agi_find_btrees(sc, fab);
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 5533e48e605d..3b38f4e2a537 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -15,16 +15,16 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub free space btrees.
*/
int
xchk_setup_ag_allocbt(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- return xchk_setup_ag_btree(sc, ip, false);
+ return xchk_setup_ag_btree(sc, false);
}
/* Free space btree scrubber. */
@@ -91,19 +91,16 @@ xchk_allocbt_xref(
STATIC int
xchk_allocbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
xfs_extlen_t len;
bno = be32_to_cpu(rec->alloc.ar_startblock);
len = be32_to_cpu(rec->alloc.ar_blockcount);
- if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ if (!xfs_verify_agbext(pag, bno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_allocbt_xref(bs->sc, bno, len);
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index d9f0dd444b80..b6f0c9f3f124 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -25,11 +25,11 @@
* reallocating the buffer if necessary. Buffer contents are not preserved
* across a reallocation.
*/
-int
+static int
xchk_setup_xattr_buf(
struct xfs_scrub *sc,
size_t value_size,
- xfs_km_flags_t flags)
+ gfp_t flags)
{
size_t sz;
struct xchk_xattr_buf *ab = sc->buf;
@@ -57,7 +57,7 @@ xchk_setup_xattr_buf(
* Don't zero the buffer upon allocation to avoid runtime overhead.
* All users must be careful never to read uninitialized contents.
*/
- ab = kmem_alloc_large(sizeof(*ab) + sz, flags);
+ ab = kvmalloc(sizeof(*ab) + sz, flags);
if (!ab)
return -ENOMEM;
@@ -69,8 +69,7 @@ xchk_setup_xattr_buf(
/* Set us up to scrub an inode's extended attributes. */
int
xchk_setup_xattr(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
int error;
@@ -80,12 +79,12 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL);
if (error)
return error;
}
- return xchk_setup_inode_contents(sc, ip, 0);
+ return xchk_setup_inode_contents(sc, 0);
}
/* Extended Attributes */
@@ -98,7 +97,7 @@ struct xchk_xattr {
/*
* Check that an extended attribute key can be looked up by hash.
*
- * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked)
+ * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
* to call this function for every attribute key in an inode. Once
* we're here, we load the attribute value to see if any errors happen,
* or if we get more or less data than we expected.
@@ -139,7 +138,8 @@ xchk_xattr_listent(
* doesn't work, we overload the seen_enough variable to convey
* the error message back to the main scrub function.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL);
+ error = xchk_setup_xattr_buf(sx->sc, valuelen,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error == -ENOMEM)
error = -EDEADLOCK;
if (error) {
@@ -147,11 +147,8 @@ xchk_xattr_listent(
return;
}
- args.flags = ATTR_KERNOTIME;
- if (flags & XFS_ATTR_ROOT)
- args.flags |= ATTR_ROOT;
- else if (flags & XFS_ATTR_SECURE)
- args.flags |= ATTR_SECURE;
+ args.op_flags = XFS_DA_OP_NOTIME;
+ args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK;
args.geo = context->dp->i_mount->m_attr_geo;
args.whichfork = XFS_ATTR_FORK;
args.dp = context->dp;
@@ -162,7 +159,10 @@ xchk_xattr_listent(
args.value = xchk_xattr_valuebuf(sx->sc);
args.valuelen = valuelen;
- error = xfs_attr_get_ilocked(context->dp, &args);
+ error = xfs_attr_get_ilocked(&args);
+ /* ENODATA means the hash lookup failed and the attr is bad */
+ if (error == -ENODATA)
+ error = -EFSCORRUPTED;
if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
&error))
goto fail_xref;
@@ -324,7 +324,8 @@ xchk_xattr_block(
return 0;
/* Allocate memory for block usage checking. */
- error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL);
+ error = xchk_setup_xattr_buf(ds->sc, 0,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (error == -ENOMEM)
return -EDEADLOCK;
if (error)
@@ -335,7 +336,7 @@ xchk_xattr_block(
bitmap_zero(usedmap, mp->m_attr_geo->blksize);
/* Check all the padding. */
- if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) {
+ if (xfs_has_crc(ds->sc->mp)) {
struct xfs_attr3_leafblock *leaf = bp->b_addr;
if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
@@ -474,7 +475,6 @@ xchk_xattr(
struct xfs_scrub *sc)
{
struct xchk_xattr sx;
- struct attrlist_cursor_kern cursor = { 0 };
xfs_dablk_t last_checked = -1U;
int error = 0;
@@ -493,11 +493,10 @@ xchk_xattr(
/* Check that every attr key can also be looked up by hash. */
sx.context.dp = sc->ip;
- sx.context.cursor = &cursor;
sx.context.resynch = 1;
sx.context.put_listent = xchk_xattr_listent;
sx.context.tp = sc->tp;
- sx.context.flags = ATTR_INCOMPLETE;
+ sx.context.allow_incomplete = true;
sx.sc = sc;
/*
@@ -516,7 +515,7 @@ xchk_xattr(
* iteration, which doesn't really follow the usual buffer
* locking order.
*/
- error = xfs_attr_list_int_ilocked(&sx.context);
+ error = xfs_attr_list_ilocked(&sx.context);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 13a1d2e8424d..3590e10e3e62 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -24,7 +24,7 @@ struct xchk_xattr_buf {
* space bitmap follows immediately after; and we have a third buffer
* for storing intermediate bitmap results.
*/
- uint8_t buf[0];
+ uint8_t buf[];
};
/* A place to store attribute values. */
@@ -65,7 +65,4 @@ xchk_xattr_dstmap(
BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
}
-int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size,
- xfs_km_flags_t flags);
-
#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 18a684e18a69..b89bf9de9b1c 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -18,14 +18,14 @@
* This is the logical equivalent of bitmap |= mask(start, len).
*/
int
-xfs_bitmap_set(
- struct xfs_bitmap *bitmap,
+xbitmap_set(
+ struct xbitmap *bitmap,
uint64_t start,
uint64_t len)
{
- struct xfs_bitmap_range *bmr;
+ struct xbitmap_range *bmr;
- bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL);
+ bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
if (!bmr)
return -ENOMEM;
@@ -39,13 +39,13 @@ xfs_bitmap_set(
/* Free everything related to this bitmap. */
void
-xfs_bitmap_destroy(
- struct xfs_bitmap *bitmap)
+xbitmap_destroy(
+ struct xbitmap *bitmap)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
- for_each_xfs_bitmap_extent(bmr, n, bitmap) {
+ for_each_xbitmap_extent(bmr, n, bitmap) {
list_del(&bmr->list);
kmem_free(bmr);
}
@@ -53,24 +53,24 @@ xfs_bitmap_destroy(
/* Set up a per-AG block bitmap. */
void
-xfs_bitmap_init(
- struct xfs_bitmap *bitmap)
+xbitmap_init(
+ struct xbitmap *bitmap)
{
INIT_LIST_HEAD(&bitmap->list);
}
/* Compare two btree extents. */
static int
-xfs_bitmap_range_cmp(
+xbitmap_range_cmp(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
- struct xfs_bitmap_range *ap;
- struct xfs_bitmap_range *bp;
+ struct xbitmap_range *ap;
+ struct xbitmap_range *bp;
- ap = container_of(a, struct xfs_bitmap_range, list);
- bp = container_of(b, struct xfs_bitmap_range, list);
+ ap = container_of(a, struct xbitmap_range, list);
+ bp = container_of(b, struct xbitmap_range, list);
if (ap->start > bp->start)
return 1;
@@ -96,14 +96,14 @@ xfs_bitmap_range_cmp(
#define LEFT_ALIGNED (1 << 0)
#define RIGHT_ALIGNED (1 << 1)
int
-xfs_bitmap_disunion(
- struct xfs_bitmap *bitmap,
- struct xfs_bitmap *sub)
+xbitmap_disunion(
+ struct xbitmap *bitmap,
+ struct xbitmap *sub)
{
struct list_head *lp;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *new_br;
- struct xfs_bitmap_range *sub_br;
+ struct xbitmap_range *br;
+ struct xbitmap_range *new_br;
+ struct xbitmap_range *sub_br;
uint64_t sub_start;
uint64_t sub_len;
int state;
@@ -113,8 +113,8 @@ xfs_bitmap_disunion(
return 0;
ASSERT(!list_empty(&sub->list));
- list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp);
- list_sort(NULL, &sub->list, xfs_bitmap_range_cmp);
+ list_sort(NULL, &bitmap->list, xbitmap_range_cmp);
+ list_sort(NULL, &sub->list, xbitmap_range_cmp);
/*
* Now that we've sorted both lists, we iterate bitmap once, rolling
@@ -124,11 +124,11 @@ xfs_bitmap_disunion(
* list traversal is similar to merge sort, but we're deleting
* instead. In this manner we avoid O(n^2) operations.
*/
- sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range,
+ sub_br = list_first_entry(&sub->list, struct xbitmap_range,
list);
lp = bitmap->list.next;
while (lp != &bitmap->list) {
- br = list_entry(lp, struct xfs_bitmap_range, list);
+ br = list_entry(lp, struct xbitmap_range, list);
/*
* Advance sub_br and/or br until we find a pair that
@@ -181,7 +181,7 @@ xfs_bitmap_disunion(
* Deleting from the middle: add the new right extent
* and then shrink the left extent.
*/
- new_br = kmem_alloc(sizeof(struct xfs_bitmap_range),
+ new_br = kmem_alloc(sizeof(struct xbitmap_range),
KM_MAYFAIL);
if (!new_br) {
error = -ENOMEM;
@@ -222,21 +222,21 @@ out:
* 1 2 3
*
* Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_ptrs[0] == 1, so we record
- * that we saw block 1. Then we observe that bc_ptrs[1] == 1, so we record
- * block 4. The list is [1, 4].
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
*
- * For the second btree record, we see that bc_ptrs[0] == 2, so we exit the
- * loop. The list remains [1, 4].
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
*
* For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_ptrs[0] == 1 again, so we record that we saw block 2. We see that
- * bc_ptrs[1] == 2, so we exit the loop. The list is now [1, 4, 2].
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
*
- * For the 102nd record, bc_ptrs[0] == 2, so we continue.
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
*
- * For the 201st record, we've moved on to leaf block 3. bc_ptrs[0] == 1, so
- * we add 3 to the list. Now it is [1, 4, 2, 3].
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
*
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
*/
@@ -247,8 +247,8 @@ out:
* blocks going from the leaf towards the root.
*/
int
-xfs_bitmap_set_btcur_path(
- struct xfs_bitmap *bitmap,
+xbitmap_set_btcur_path(
+ struct xbitmap *bitmap,
struct xfs_btree_cur *cur)
{
struct xfs_buf *bp;
@@ -256,12 +256,12 @@ xfs_bitmap_set_btcur_path(
int i;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) {
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
xfs_btree_get_block(cur, i, &bp);
if (!bp)
continue;
- fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
- error = xfs_bitmap_set(bitmap, fsb, 1);
+ fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ error = xbitmap_set(bitmap, fsb, 1);
if (error)
return error;
}
@@ -271,12 +271,12 @@ xfs_bitmap_set_btcur_path(
/* Collect a btree's block in the bitmap. */
STATIC int
-xfs_bitmap_collect_btblock(
+xbitmap_collect_btblock(
struct xfs_btree_cur *cur,
int level,
void *priv)
{
- struct xfs_bitmap *bitmap = priv;
+ struct xbitmap *bitmap = priv;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
@@ -284,16 +284,31 @@ xfs_bitmap_collect_btblock(
if (!bp)
return 0;
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
- return xfs_bitmap_set(bitmap, fsbno, 1);
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ return xbitmap_set(bitmap, fsbno, 1);
}
/* Walk the btree and mark the bitmap wherever a btree block is found. */
int
-xfs_bitmap_set_btblocks(
- struct xfs_bitmap *bitmap,
+xbitmap_set_btblocks(
+ struct xbitmap *bitmap,
struct xfs_btree_cur *cur)
{
- return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock,
+ return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
XFS_BTREE_VISIT_ALL, bitmap);
}
+
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap_hweight(
+ struct xbitmap *bitmap)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+ uint64_t ret = 0;
+
+ for_each_xbitmap_extent(bmr, n, bitmap)
+ ret += bmr->len;
+
+ return ret;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index ae8ecbce6fa6..900646b72de1 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -6,31 +6,32 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xfs_bitmap_range {
+struct xbitmap_range {
struct list_head list;
uint64_t start;
uint64_t len;
};
-struct xfs_bitmap {
+struct xbitmap {
struct list_head list;
};
-void xfs_bitmap_init(struct xfs_bitmap *bitmap);
-void xfs_bitmap_destroy(struct xfs_bitmap *bitmap);
+void xbitmap_init(struct xbitmap *bitmap);
+void xbitmap_destroy(struct xbitmap *bitmap);
-#define for_each_xfs_bitmap_extent(bex, n, bitmap) \
+#define for_each_xbitmap_extent(bex, n, bitmap) \
list_for_each_entry_safe((bex), (n), &(bitmap)->list, list)
-#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \
+#define for_each_xbitmap_block(b, bex, n, bitmap) \
list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \
- for ((b) = bex->start; (b) < bex->start + bex->len; (b)++)
+ for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++)
-int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len);
-int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub);
-int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap,
+int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
+int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
+int xbitmap_set_btcur_path(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
-int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap,
+int xbitmap_set_btblocks(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
+uint64_t xbitmap_hweight(struct xbitmap *bitmap);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index fa6ea6407992..f0b9cb6506fd 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -22,16 +22,16 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/* Set us up with an inode's bmap. */
int
xchk_setup_inode_bmap(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
int error;
- error = xchk_get_inode(sc, ip);
+ error = xchk_get_inode(sc);
if (error)
goto out;
@@ -45,9 +45,27 @@ xchk_setup_inode_bmap(
*/
if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+ struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+
inode_dio_wait(VFS_I(sc->ip));
- error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
- if (error)
+
+ /*
+ * Try to flush all incore state to disk before we examine the
+ * space mappings for the data fork. Leave accumulated errors
+ * in the mapping for the writer threads to consume.
+ *
+ * On ENOSPC or EIO writeback errors, we continue into the
+ * extent mapping checks because write failures do not
+ * necessarily imply anything about the correctness of the file
+ * metadata. The metadata and the file data could be on
+ * completely separate devices; a media failure might only
+ * affect a subset of the disk, etc. We can handle delalloc
+ * extents in the scrubber, so leaving them in memory is fine.
+ */
+ error = filemap_fdatawrite(mapping);
+ if (!error)
+ error = filemap_fdatawait_keep_errors(mapping);
+ if (error && (error != -ENOSPC && error != -EIO))
goto out;
}
@@ -95,6 +113,8 @@ xchk_bmap_get_rmap(
if (info->whichfork == XFS_ATTR_FORK)
rflags |= XFS_RMAP_ATTR_FORK;
+ if (irec->br_state == XFS_EXT_UNWRITTEN)
+ rflags |= XFS_RMAP_UNWRITTEN;
/*
* CoW staging extents are owned (on disk) by the refcountbt, so
@@ -113,29 +133,13 @@ xchk_bmap_get_rmap(
if (info->is_shared) {
error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
owner, offset, rflags, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
- goto out;
+ } else {
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
}
-
- /*
- * Otherwise, use the (faster) regular lookup.
- */
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
- offset, rflags, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
- if (!has_rmap)
- goto out;
-
- error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
return false;
-out:
if (!has_rmap)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -198,13 +202,13 @@ xchk_bmap_xref_rmap(
* which doesn't track unwritten state.
*/
if (owner != XFS_RMAP_OWN_COW &&
- irec->br_state == XFS_EXT_UNWRITTEN &&
- !(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
+ !!(irec->br_state == XFS_EXT_UNWRITTEN) !=
+ !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- if (info->whichfork == XFS_ATTR_FORK &&
- !(rmap.rm_flags & XFS_RMAP_ATTR_FORK))
+ if (!!(info->whichfork == XFS_ATTR_FORK) !=
+ !!(rmap.rm_flags & XFS_RMAP_ATTR_FORK))
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
@@ -240,10 +244,10 @@ xchk_bmap_iextent_xref(
agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
len = irec->br_blockcount;
- error = xchk_ag_init(info->sc, agno, &info->sc->sa);
+ error = xchk_ag_init_existing(info->sc, agno, &info->sc->sa);
if (!xchk_fblock_process_error(info->sc, info->whichfork,
irec->br_startoff, &error))
- return;
+ goto out_free;
xchk_xref_is_used_space(info->sc, agbno, len);
xchk_xref_is_not_inode_chunk(info->sc, agbno, len);
@@ -252,7 +256,7 @@ xchk_bmap_iextent_xref(
case XFS_DATA_FORK:
if (xfs_is_reflink_inode(info->sc->ip))
break;
- /* fall through */
+ fallthrough;
case XFS_ATTR_FORK:
xchk_xref_is_not_shared(info->sc, agbno,
irec->br_blockcount);
@@ -263,6 +267,7 @@ xchk_bmap_iextent_xref(
break;
}
+out_free:
xchk_ag_free(info->sc, &info->sc->sa);
}
@@ -299,7 +304,6 @@ xchk_bmap_iextent(
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
- xfs_filblks_t end;
int error = 0;
/*
@@ -310,6 +314,10 @@ xchk_bmap_iextent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
xchk_bmap_dirattr_extent(ip, info, irec);
/* There should never be a "hole" extent in either extent list. */
@@ -326,23 +334,15 @@ xchk_bmap_iextent(
irec->br_startoff);
/* Make sure the extent points to a valid place. */
- if (irec->br_blockcount > MAXEXTLEN)
- xchk_fblock_set_corrupt(info->sc, info->whichfork,
- irec->br_startoff);
- if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock)
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- end = irec->br_startblock + irec->br_blockcount - 1;
if (info->is_rt &&
- (!xfs_verify_rtbno(mp, irec->br_startblock) ||
- !xfs_verify_rtbno(mp, end)))
+ !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (!info->is_rt &&
- (!xfs_verify_fsbno(mp, irec->br_startblock) ||
- !xfs_verify_fsbno(mp, end) ||
- XFS_FSB_TO_AGNO(mp, irec->br_startblock) !=
- XFS_FSB_TO_AGNO(mp, end)))
+ !xfs_verify_fsbext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -368,16 +368,16 @@ xchk_bmap_iextent(
STATIC int
xchk_bmapbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_bmbt_irec irec;
struct xfs_bmbt_irec iext_irec;
struct xfs_iext_cursor icur;
struct xchk_bmap_info *info = bs->private;
- struct xfs_inode *ip = bs->cur->bc_private.b.ip;
+ struct xfs_inode *ip = bs->cur->bc_ino.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork);
uint64_t owner;
int i;
@@ -385,8 +385,8 @@ xchk_bmapbt_rec(
* Check the owners of the btree blocks up to the level below
* the root since the verifiers don't do that.
*/
- if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) &&
- bs->cur->bc_ptrs[0] == 1) {
+ if (xfs_has_crc(bs->cur->bc_mp) &&
+ bs->cur->bc_levels[0].ptr == 1) {
for (i = 0; i < bs->cur->bc_nlevels - 1; i++) {
block = xfs_btree_get_block(bs->cur, i, &bp);
owner = be64_to_cpu(block->bb_u.l.bb_owner);
@@ -426,19 +426,18 @@ xchk_bmap_btree(
struct xchk_bmap_info *info)
{
struct xfs_owner_info oinfo;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
struct xfs_btree_cur *cur;
int error;
/* Load the incore bmap cache if it's not loaded. */
- info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
- if (!info->was_loaded) {
- error = xfs_iread_extents(sc->tp, ip, whichfork);
- if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
- goto out;
- }
+ info->was_loaded = !xfs_need_iread_extents(ifp);
+
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
/* Check the btree structure. */
cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
@@ -459,10 +458,11 @@ struct xchk_bmap_check_rmap_info {
STATIC int
xchk_bmap_check_rmap(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xfs_bmbt_irec irec;
+ struct xfs_rmap_irec check_rec;
struct xchk_bmap_check_rmap_info *sbcri = priv;
struct xfs_ifork *ifp;
struct xfs_scrub *sc = sbcri->sc;
@@ -478,7 +478,7 @@ xchk_bmap_check_rmap(
return 0;
/* Now look up the bmbt record. */
- ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork);
+ ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork);
if (!ifp) {
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
@@ -496,28 +496,30 @@ xchk_bmap_check_rmap(
* length, so we have to loop through the bmbt to make sure that the
* entire rmap is covered by bmbt records.
*/
+ check_rec = *rec;
while (have_map) {
- if (irec.br_startoff != rec->rm_offset)
+ if (irec.br_startoff != check_rec.rm_offset)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
+ check_rec.rm_offset);
if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_private.a.agno, rec->rm_startblock))
+ cur->bc_ag.pag->pag_agno,
+ check_rec.rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
- if (irec.br_blockcount > rec->rm_blockcount)
+ check_rec.rm_offset);
+ if (irec.br_blockcount > check_rec.rm_blockcount)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
+ check_rec.rm_offset);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
break;
- rec->rm_startblock += irec.br_blockcount;
- rec->rm_offset += irec.br_blockcount;
- rec->rm_blockcount -= irec.br_blockcount;
- if (rec->rm_blockcount == 0)
+ check_rec.rm_startblock += irec.br_blockcount;
+ check_rec.rm_offset += irec.br_blockcount;
+ check_rec.rm_blockcount -= irec.br_blockcount;
+ if (check_rec.rm_blockcount == 0)
break;
have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec);
if (!have_map)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
- rec->rm_offset);
+ check_rec.rm_offset);
}
out:
@@ -531,22 +533,18 @@ STATIC int
xchk_bmap_check_ag_rmaps(
struct xfs_scrub *sc,
int whichfork,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
struct xchk_bmap_check_rmap_info sbcri;
struct xfs_btree_cur *cur;
struct xfs_buf *agf;
int error;
- error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf);
if (error)
return error;
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno);
- if (!cur) {
- error = -ENOMEM;
- goto out_agf;
- }
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag);
sbcri.sc = sc;
sbcri.whichfork = whichfork;
@@ -555,7 +553,6 @@ xchk_bmap_check_ag_rmaps(
error = 0;
xfs_btree_del_cursor(cur, error);
-out_agf:
xfs_trans_brelse(sc->tp, agf);
return error;
}
@@ -566,11 +563,13 @@ xchk_bmap_check_rmaps(
struct xfs_scrub *sc,
int whichfork)
{
- loff_t size;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
+ bool zero_size;
int error;
- if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) ||
+ if (!xfs_has_rmapbt(sc->mp) ||
whichfork == XFS_COW_FORK ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return 0;
@@ -579,6 +578,8 @@ xchk_bmap_check_rmaps(
if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
return 0;
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+
/*
* Only do this for complex maps that are in btree format, or for
* situations where we would seem to have a size but zero extents.
@@ -586,30 +587,26 @@ xchk_bmap_check_rmaps(
* to flag this bmap as corrupt if there are rmaps that need to be
* reattached.
*/
- switch (whichfork) {
- case XFS_DATA_FORK:
- size = i_size_read(VFS_I(sc->ip));
- break;
- case XFS_ATTR_FORK:
- size = XFS_IFORK_Q(sc->ip);
- break;
- default:
- size = 0;
- break;
- }
- if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0))
+
+ if (whichfork == XFS_DATA_FORK)
+ zero_size = i_size_read(VFS_I(sc->ip)) == 0;
+ else
+ zero_size = false;
+
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE &&
+ (zero_size || ifp->if_nextents > 0))
return 0;
- for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) {
- error = xchk_bmap_check_ag_rmaps(sc, whichfork, agno);
+ for_each_perag(sc->mp, agno, pag) {
+ error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error)
- return error;
+ break;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
break;
}
-
- return 0;
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -627,12 +624,14 @@ xchk_bmap(
struct xchk_bmap_info info = { NULL };
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t endoff;
struct xfs_iext_cursor icur;
int error = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* Non-existent forks can be ignored. */
+ if (!ifp)
+ goto out;
info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
info.whichfork = whichfork;
@@ -641,9 +640,6 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* Non-existent CoW forks are ignorable. */
- if (!ifp)
- goto out;
/* No CoW forks on non-reflink inodes/filesystems. */
if (!xfs_is_reflink_inode(ip)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -651,10 +647,7 @@ xchk_bmap(
}
break;
case XFS_ATTR_FORK:
- if (!ifp)
- goto out_check_rmap;
- if (!xfs_sb_version_hasattr(&mp->m_sb) &&
- !xfs_sb_version_hasattr2(&mp->m_sb))
+ if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
break;
default:
@@ -663,17 +656,13 @@ xchk_bmap(
}
/* Check the fork values */
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_UUID:
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
/* No mappings to check. */
goto out;
case XFS_DINODE_FMT_EXTENTS:
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
- }
break;
case XFS_DINODE_FMT_BTREE:
if (whichfork == XFS_COW_FORK) {
@@ -700,7 +689,7 @@ xchk_bmap(
/* Scrub extent records. */
info.lastoff = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
@@ -717,7 +706,6 @@ xchk_bmap(
goto out;
}
-out_check_rmap:
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index f52a7b8256f9..2f4519590dc1 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -9,6 +9,7 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -43,7 +44,7 @@ __xchk_btree_process_error(
/* Note the badness but don't abort. */
sc->sm->sm_flags |= errflag;
*error = 0;
- /* fall through */
+ fallthrough;
default:
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
trace_xchk_ifork_btree_op_error(sc, cur, level,
@@ -135,14 +136,14 @@ xchk_btree_rec(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, 0, &bp);
- rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
trace_xchk_btree_rec(bs->sc, cur, 0);
/* If this isn't the first record, are they in order? */
- if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ if (cur->bc_levels[0].ptr > 1 &&
+ !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
xchk_btree_set_corrupt(bs->sc, cur, 0);
- bs->firstrec = false;
memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
if (cur->bc_nlevels == 1)
@@ -151,7 +152,7 @@ xchk_btree_rec(
/* Is this at least as large as the parent low key? */
cur->bc_ops->init_key_from_rec(&key, rec);
keyblock = xfs_btree_get_block(cur, 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
@@ -160,7 +161,7 @@ xchk_btree_rec(
/* Is this no larger than the parent high key? */
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
xchk_btree_set_corrupt(bs->sc, cur, 1);
}
@@ -182,23 +183,22 @@ xchk_btree_key(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+ key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
trace_xchk_btree_key(bs->sc, cur, level);
/* If this isn't the first key, are they in order? */
- if (!bs->firstkey[level] &&
- !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ if (cur->bc_levels[level].ptr > 1 &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
xchk_btree_set_corrupt(bs->sc, cur, level);
- bs->firstkey[level] = false;
- memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+ memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
if (level + 1 >= cur->bc_nlevels)
return;
/* Is this at least as large as the parent low key? */
keyblock = xfs_btree_get_block(cur, level + 1, &bp);
- keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
@@ -206,8 +206,9 @@ xchk_btree_key(
return;
/* Is this no larger than the parent high key? */
- key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
- keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ keyblock);
if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
xchk_btree_set_corrupt(bs->sc, cur, level);
}
@@ -290,7 +291,7 @@ xchk_btree_block_check_sibling(
/* Compare upper level pointer to sibling pointer. */
pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
- pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
if (!xchk_btree_ptr_ok(bs, level + 1, pp))
goto out;
if (pbp)
@@ -373,10 +374,10 @@ xchk_btree_check_block_owner(
init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
if (init_sa) {
- error = xchk_ag_init(bs->sc, agno, &bs->sc->sa);
+ error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
level, &error))
- return error;
+ goto out_free;
}
xchk_xref_is_used_space(bs->sc, agbno, 1);
@@ -392,6 +393,7 @@ xchk_btree_check_block_owner(
if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
bs->cur = NULL;
+out_free:
if (init_sa)
xchk_ag_free(bs->sc, &bs->sc->sa);
@@ -434,12 +436,36 @@ xchk_btree_check_owner(
if (!co)
return -ENOMEM;
co->level = level;
- co->daddr = XFS_BUF_ADDR(bp);
+ co->daddr = xfs_buf_daddr(bp);
list_add_tail(&co->list, &bs->to_check);
return 0;
}
- return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp));
+ return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
+}
+
+/* Decide if we want to check minrecs of a btree block in the inode root. */
+static inline bool
+xchk_btree_check_iroot_minrecs(
+ struct xchk_btree *bs)
+{
+ /*
+ * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
+ * would miscalculate the space required for the data fork bmbt root
+ * when adding an attr fork, and promote the iroot contents to an
+ * external block unnecessarily. This went unnoticed for many years
+ * until scrub found filesystems in this state. Inode rooted btrees are
+ * not supposed to have immediate child blocks that are small enough
+ * that the contents could fit in the inode root, but we can't fail
+ * existing filesystems, so instead we disable the check for data fork
+ * bmap btrees when there's an attr fork.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+ bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
+ xfs_inode_has_attr_fork(bs->sc->ip))
+ return false;
+
+ return true;
}
/*
@@ -452,32 +478,42 @@ xchk_btree_check_minrecs(
int level,
struct xfs_btree_block *block)
{
- unsigned int numrecs;
- int ok_level;
-
- numrecs = be16_to_cpu(block->bb_numrecs);
+ struct xfs_btree_cur *cur = bs->cur;
+ unsigned int root_level = cur->bc_nlevels - 1;
+ unsigned int numrecs = be16_to_cpu(block->bb_numrecs);
/* More records than minrecs means the block is ok. */
- if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level))
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
return;
/*
- * Certain btree blocks /can/ have fewer than minrecs records. Any
- * level greater than or equal to the level of the highest dedicated
- * btree block are allowed to violate this constraint.
- *
- * For a btree rooted in a block, the btree root can have fewer than
- * minrecs records. If the btree is rooted in an inode and does not
- * store records in the root, the direct children of the root and the
- * root itself can have fewer than minrecs records.
+ * For btrees rooted in the inode, it's possible that the root block
+ * contents spilled into a regular ondisk block because there wasn't
+ * enough space in the inode root. The number of records in that
+ * child block might be less than the standard minrecs, but that's ok
+ * provided that there's only one direct child of the root.
*/
- ok_level = bs->cur->bc_nlevels - 1;
- if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
- ok_level--;
- if (level >= ok_level)
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 2) {
+ struct xfs_btree_block *root_block;
+ struct xfs_buf *root_bp;
+ int root_maxrecs;
+
+ root_block = xfs_btree_get_block(cur, root_level, &root_bp);
+ root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
+ if (xchk_btree_check_iroot_minrecs(bs) &&
+ (be16_to_cpu(root_block->bb_numrecs) != 1 ||
+ numrecs <= root_maxrecs))
+ xchk_btree_set_corrupt(bs->sc, cur, level);
return;
+ }
- xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ /*
+ * Otherwise, only the root level is allowed to have fewer than minrecs
+ * records or keyptrs.
+ */
+ if (level < root_level)
+ xchk_btree_set_corrupt(bs->sc, cur, level);
}
/*
@@ -560,7 +596,7 @@ xchk_btree_block_keys(
/* Obtain the parent's copy of the keys for this block. */
parent_block = xfs_btree_get_block(cur, level + 1, &bp);
- parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
@@ -571,7 +607,7 @@ xchk_btree_block_keys(
/* Get high keys */
high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
- high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
@@ -591,35 +627,39 @@ xchk_btree(
const struct xfs_owner_info *oinfo,
void *private)
{
- struct xchk_btree bs = {
- .cur = cur,
- .scrub_rec = scrub_fn,
- .oinfo = oinfo,
- .firstrec = true,
- .private = private,
- .sc = sc,
- };
union xfs_btree_ptr ptr;
+ struct xchk_btree *bs;
union xfs_btree_ptr *pp;
union xfs_btree_rec *recp;
struct xfs_btree_block *block;
- int level;
struct xfs_buf *bp;
struct check_owner *co;
struct check_owner *n;
- int i;
+ size_t cur_sz;
+ int level;
int error = 0;
- /* Initialize scrub state */
- for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
- bs.firstkey[i] = true;
- INIT_LIST_HEAD(&bs.to_check);
-
- /* Don't try to check a tree with a height we can't handle. */
- if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ /*
+ * Allocate the btree scrub context from the heap, because this
+ * structure can get rather large. Don't let a caller feed us a
+ * totally absurd size.
+ */
+ cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
+ if (cur_sz > PAGE_SIZE) {
xchk_btree_set_corrupt(sc, cur, 0);
- goto out;
+ return 0;
}
+ bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ if (!bs)
+ return -ENOMEM;
+ bs->cur = cur;
+ bs->scrub_rec = scrub_fn;
+ bs->oinfo = oinfo;
+ bs->private = private;
+ bs->sc = sc;
+
+ /* Initialize scrub state */
+ INIT_LIST_HEAD(&bs->to_check);
/*
* Load the root of the btree. The helper function absorbs
@@ -627,79 +667,82 @@ xchk_btree(
*/
level = cur->bc_nlevels - 1;
cur->bc_ops->init_ptr_from_cur(cur, &ptr);
- if (!xchk_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
goto out;
- error = xchk_btree_get_block(&bs, level, &ptr, &block, &bp);
+ error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
while (level < cur->bc_nlevels) {
block = xfs_btree_get_block(cur, level, &bp);
if (level == 0) {
/* End of leaf, pop back towards the root. */
- if (cur->bc_ptrs[level] >
+ if (cur->bc_levels[level].ptr >
be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Records in order for scrub? */
- xchk_btree_rec(&bs);
+ xchk_btree_rec(bs);
/* Call out to the record checker. */
- recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
- error = bs.scrub_rec(&bs, recp);
+ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
+ block);
+ error = bs->scrub_rec(bs, recp);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
- cur->bc_ptrs[level]++;
+ cur->bc_levels[level].ptr++;
continue;
}
/* End of node, pop back towards the root. */
- if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
- xchk_btree_block_keys(&bs, level, block);
+ if (cur->bc_levels[level].ptr >
+ be16_to_cpu(block->bb_numrecs)) {
+ xchk_btree_block_keys(bs, level, block);
if (level < cur->bc_nlevels - 1)
- cur->bc_ptrs[level + 1]++;
+ cur->bc_levels[level + 1].ptr++;
level++;
continue;
}
/* Keys in order for scrub? */
- xchk_btree_key(&bs, level);
+ xchk_btree_key(bs, level);
/* Drill another level deeper. */
- pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
- if (!xchk_btree_ptr_ok(&bs, level, pp)) {
- cur->bc_ptrs[level]++;
+ pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
+ if (!xchk_btree_ptr_ok(bs, level, pp)) {
+ cur->bc_levels[level].ptr++;
continue;
}
level--;
- error = xchk_btree_get_block(&bs, level, pp, &block, &bp);
+ error = xchk_btree_get_block(bs, level, pp, &block, &bp);
if (error || !block)
goto out;
- cur->bc_ptrs[level] = 1;
+ cur->bc_levels[level].ptr = 1;
}
out:
/* Process deferred owner checks on btree blocks. */
- list_for_each_entry_safe(co, n, &bs.to_check, list) {
- if (!error && bs.cur)
- error = xchk_btree_check_block_owner(&bs,
- co->level, co->daddr);
+ list_for_each_entry_safe(co, n, &bs->to_check, list) {
+ if (!error && bs->cur)
+ error = xchk_btree_check_block_owner(bs, co->level,
+ co->daddr);
list_del(&co->list);
kmem_free(co);
}
+ kmem_free(bs);
return error;
}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index 5572e475f8ed..da61a53a0b61 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -26,8 +26,8 @@ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,
struct xchk_btree;
typedef int (*xchk_btree_rec_fn)(
- struct xchk_btree *bs,
- union xfs_btree_rec *rec);
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec);
struct xchk_btree {
/* caller-provided scrub state */
@@ -39,11 +39,22 @@ struct xchk_btree {
/* internal scrub state */
union xfs_btree_rec lastrec;
- bool firstrec;
- union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
- bool firstkey[XFS_BTREE_MAXLEVELS];
struct list_head to_check;
+
+ /* this element must come last! */
+ union xfs_btree_key lastkey[];
};
+
+/*
+ * Calculate the size of a xchk_btree structure. There are nlevels-1 slots for
+ * keys because we track leaf records separately in lastrec.
+ */
+static inline size_t
+xchk_btree_sizeof(unsigned int nlevels)
+{
+ return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1);
+}
+
int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
void *private);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 18876056e5e0..9bbbf20f401b 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -12,7 +12,6 @@
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
@@ -24,8 +23,11 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -74,14 +76,16 @@ __xchk_process_error(
return true;
case -EDEADLOCK:
/* Used to restart an op with deadlock avoidance. */
- trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
+ trace_xchk_deadlock_retry(
+ sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
+ sc->sm, *error);
break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
sc->sm->sm_flags |= errflag;
*error = 0;
- /* fall through */
+ fallthrough;
default:
trace_xchk_op_error(sc, agno, bno, *error,
ret_ip);
@@ -134,7 +138,7 @@ __xchk_fblock_process_error(
/* Note the badness but don't abort. */
sc->sm->sm_flags |= errflag;
*error = 0;
- /* fall through */
+ fallthrough;
default:
trace_xchk_file_op_error(sc, whichfork, offset, *error,
ret_ip);
@@ -184,7 +188,7 @@ xchk_block_set_preen(
struct xfs_buf *bp)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
- trace_xchk_block_preen(sc, bp->b_bn, __return_address);
+ trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}
/*
@@ -217,7 +221,7 @@ xchk_block_set_corrupt(
struct xfs_buf *bp)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- trace_xchk_block_error(sc, bp->b_bn, __return_address);
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
/* Record a corruption while cross-referencing. */
@@ -227,7 +231,7 @@ xchk_block_xref_set_corrupt(
struct xfs_buf *bp)
{
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
- trace_xchk_block_error(sc, bp->b_bn, __return_address);
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}
/*
@@ -322,7 +326,7 @@ struct xchk_rmap_ownedby_info {
STATIC int
xchk_count_rmap_ownedby_irec(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xchk_rmap_ownedby_info *sroi = priv;
@@ -392,37 +396,39 @@ want_ag_read_header_failure(
}
/*
- * Grab all the headers for an AG.
+ * Grab the perag structure and all the headers for an AG.
*
- * The headers should be released by xchk_ag_free, but as a fail
- * safe we attach all the buffers we grab to the scrub transaction so
- * they'll all be freed when we cancel it.
+ * The headers should be released by xchk_ag_free, but as a fail safe we attach
+ * all the buffers we grab to the scrub transaction so they'll all be freed
+ * when we cancel it. Returns ENOENT if we can't grab the perag structure.
*/
int
xchk_ag_read_headers(
struct xfs_scrub *sc,
xfs_agnumber_t agno,
- struct xfs_buf **agi,
- struct xfs_buf **agf,
- struct xfs_buf **agfl)
+ struct xchk_ag *sa)
{
struct xfs_mount *mp = sc->mp;
int error;
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi);
+ ASSERT(!sa->pag);
+ sa->pag = xfs_perag_get(mp, agno);
+ if (!sa->pag)
+ return -ENOENT;
+
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
- goto out;
+ return error;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf);
+ error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
- goto out;
+ return error;
- error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl);
+ error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- goto out;
- error = 0;
-out:
- return error;
+ return error;
+
+ return 0;
}
/* Release all the AG btree cursors. */
@@ -452,72 +458,54 @@ xchk_ag_btcur_free(
}
/* Initialize all the btree cursors for an AG. */
-int
+void
xchk_ag_btcur_init(
struct xfs_scrub *sc,
struct xchk_ag *sa)
{
struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sa->agno;
- xchk_perag_get(sc->mp, sa);
if (sa->agf_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
/* Set up a bnobt cursor for cross-referencing. */
sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- agno, XFS_BTNUM_BNO);
- if (!sa->bno_cur)
- goto err;
+ sa->pag, XFS_BTNUM_BNO);
}
if (sa->agf_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
- agno, XFS_BTNUM_CNT);
- if (!sa->cnt_cur)
- goto err;
+ sa->pag, XFS_BTNUM_CNT);
}
/* Set up a inobt cursor for cross-referencing. */
if (sa->agi_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
- agno, XFS_BTNUM_INO);
- if (!sa->ino_cur)
- goto err;
+ sa->pag, XFS_BTNUM_INO);
}
/* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ if (sa->agi_bp && xfs_has_finobt(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
- agno, XFS_BTNUM_FINO);
- if (!sa->fino_cur)
- goto err;
+ sa->pag, XFS_BTNUM_FINO);
}
/* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ if (sa->agf_bp && xfs_has_rmapbt(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
- agno);
- if (!sa->rmap_cur)
- goto err;
+ sa->pag);
}
/* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) &&
+ if (sa->agf_bp && xfs_has_reflink(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
- sa->agf_bp, agno);
- if (!sa->refc_cur)
- goto err;
+ sa->agf_bp, sa->pag);
}
-
- return 0;
-err:
- return -ENOMEM;
}
/* Release the AG header context and btree cursors. */
@@ -543,15 +531,14 @@ xchk_ag_free(
xfs_perag_put(sa->pag);
sa->pag = NULL;
}
- sa->agno = NULLAGNUMBER;
}
/*
- * For scrub, grab the AGI and the AGF headers, in that order. Locking
- * order requires us to get the AGI before the AGF. We use the
- * transaction to avoid deadlocking on crosslinked metadata buffers;
- * either the caller passes one in (bmap scrub) or we have to create a
- * transaction ourselves.
+ * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
+ * order. Locking order requires us to get the AGI before the AGF. We use the
+ * transaction to avoid deadlocking on crosslinked metadata buffers; either the
+ * caller passes one in (bmap scrub) or we have to create a transaction
+ * ourselves. Returns ENOENT if the perag struct cannot be grabbed.
*/
int
xchk_ag_init(
@@ -561,26 +548,12 @@ xchk_ag_init(
{
int error;
- sa->agno = agno;
- error = xchk_ag_read_headers(sc, agno, &sa->agi_bp,
- &sa->agf_bp, &sa->agfl_bp);
+ error = xchk_ag_read_headers(sc, agno, sa);
if (error)
return error;
- return xchk_ag_btcur_init(sc, sa);
-}
-
-/*
- * Grab the per-ag structure if we haven't already gotten it. Teardown of the
- * xchk_ag will release it for us.
- */
-void
-xchk_perag_get(
- struct xfs_mount *mp,
- struct xchk_ag *sa)
-{
- if (!sa->pag)
- sa->pag = xfs_perag_get(mp, sa->agno);
+ xchk_ag_btcur_init(sc, sa);
+ return 0;
}
/* Per-scrubber setup functions */
@@ -610,8 +583,7 @@ xchk_trans_alloc(
/* Set us up with a transaction and an empty context. */
int
xchk_setup_fs(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
uint resblks;
@@ -623,7 +595,6 @@ xchk_setup_fs(
int
xchk_setup_ag_btree(
struct xfs_scrub *sc,
- struct xfs_inode *ip,
bool force_log)
{
struct xfs_mount *mp = sc->mp;
@@ -641,7 +612,7 @@ xchk_setup_ag_btree(
return error;
}
- error = xchk_setup_fs(sc, ip);
+ error = xchk_setup_fs(sc);
if (error)
return error;
@@ -669,11 +640,11 @@ xchk_checkpoint_log(
*/
int
xchk_get_inode(
- struct xfs_scrub *sc,
- struct xfs_inode *ip_in)
+ struct xfs_scrub *sc)
{
struct xfs_imap imap;
struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
struct xfs_inode *ip = NULL;
int error;
@@ -713,7 +684,7 @@ xchk_get_inode(
if (error)
return -ENOENT;
error = -EFSCORRUPTED;
- /* fall through */
+ fallthrough;
default:
trace_xchk_op_error(sc,
XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
@@ -734,12 +705,11 @@ xchk_get_inode(
int
xchk_setup_inode_contents(
struct xfs_scrub *sc,
- struct xfs_inode *ip,
unsigned int resblks)
{
int error;
- error = xchk_get_inode(sc, ip);
+ error = xchk_get_inode(sc);
if (error)
return error;
@@ -816,7 +786,7 @@ xchk_buffer_recheck(
if (!fa)
return;
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- trace_xchk_block_error(sc, bp->b_bn, fa);
+ trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}
/*
@@ -835,7 +805,7 @@ xchk_metadata_inode_forks(
return 0;
/* Metadata inodes don't live on the rt device. */
- if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}
@@ -861,7 +831,7 @@ xchk_metadata_inode_forks(
return error;
/* Look for incorrect shared blocks. */
- if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ if (xfs_has_reflink(sc->mp)) {
error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
&shared);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
@@ -902,7 +872,8 @@ xchk_stop_reaping(
struct xfs_scrub *sc)
{
sc->flags |= XCHK_REAPING_DISABLED;
- xfs_stop_block_reaping(sc->mp);
+ xfs_blockgc_stop(sc->mp);
+ xfs_inodegc_stop(sc->mp);
}
/* Restart background reaping of resources. */
@@ -910,6 +881,13 @@ void
xchk_start_reaping(
struct xfs_scrub *sc)
{
- xfs_start_block_reaping(sc->mp);
+ /*
+ * Readonly filesystems do not perform inactivation or speculative
+ * preallocation, so there's no need to restart the workers.
+ */
+ if (!xfs_is_readonly(sc->mp)) {
+ xfs_inodegc_start(sc->mp);
+ xfs_blockgc_start(sc->mp);
+ }
sc->flags &= ~XCHK_REAPING_DISABLED;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 2e50d146105d..454145db10e7 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -72,66 +72,68 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
struct xfs_btree_cur **curpp);
/* Setup functions */
-int xchk_setup_fs(struct xfs_scrub *sc, struct xfs_inode *ip);
-int xchk_setup_ag_allocbt(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_ag_iallocbt(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_ag_rmapbt(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_ag_refcountbt(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_inode(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_inode_bmap(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_inode_bmap_data(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_directory(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_xattr(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_symlink(struct xfs_scrub *sc,
- struct xfs_inode *ip);
-int xchk_setup_parent(struct xfs_scrub *sc,
- struct xfs_inode *ip);
+int xchk_setup_fs(struct xfs_scrub *sc);
+int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
+int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
+int xchk_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xchk_setup_ag_refcountbt(struct xfs_scrub *sc);
+int xchk_setup_inode(struct xfs_scrub *sc);
+int xchk_setup_inode_bmap(struct xfs_scrub *sc);
+int xchk_setup_inode_bmap_data(struct xfs_scrub *sc);
+int xchk_setup_directory(struct xfs_scrub *sc);
+int xchk_setup_xattr(struct xfs_scrub *sc);
+int xchk_setup_symlink(struct xfs_scrub *sc);
+int xchk_setup_parent(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
-int xchk_setup_rt(struct xfs_scrub *sc, struct xfs_inode *ip);
+int xchk_setup_rt(struct xfs_scrub *sc);
#else
static inline int
-xchk_setup_rt(struct xfs_scrub *sc, struct xfs_inode *ip)
+xchk_setup_rt(struct xfs_scrub *sc)
{
return -ENOENT;
}
#endif
#ifdef CONFIG_XFS_QUOTA
-int xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip);
+int xchk_setup_quota(struct xfs_scrub *sc);
#else
static inline int
-xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip)
+xchk_setup_quota(struct xfs_scrub *sc)
{
return -ENOENT;
}
#endif
-int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip);
+int xchk_setup_fscounters(struct xfs_scrub *sc);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
-void xchk_perag_get(struct xfs_mount *mp, struct xchk_ag *sa);
+
+/*
+ * Grab all AG resources, treating the inability to grab the perag structure as
+ * a fs corruption. This is intended for callers checking an ondisk reference
+ * to a given AG, which means that the AG must still exist.
+ */
+static inline int
+xchk_ag_init_existing(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ int error = xchk_ag_init(sc, agno, sa);
+
+ return error == -ENOENT ? -EFSCORRUPTED : error;
+}
+
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
- struct xfs_buf **agi, struct xfs_buf **agf,
- struct xfs_buf **agfl);
+ struct xchk_ag *sa);
void xchk_ag_btcur_free(struct xchk_ag *sa);
-int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
+void xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
-int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip,
- bool force_log);
-int xchk_get_inode(struct xfs_scrub *sc, struct xfs_inode *ip_in);
-int xchk_setup_inode_contents(struct xfs_scrub *sc, struct xfs_inode *ip,
- unsigned int resblks);
+int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
+int xchk_get_inode(struct xfs_scrub *sc);
+int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
/*
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 97a15b6f2865..84fe3d33d699 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -47,7 +47,7 @@ xchk_da_process_error(
/* Note the badness but don't abort. */
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
*error = 0;
- /* fall through */
+ fallthrough;
default:
trace_xchk_file_op_error(sc, ds->dargs.whichfork,
xfs_dir2_da_to_db(ds->dargs.geo,
@@ -219,19 +219,21 @@ xchk_da_btree_block_check_sibling(
int direction,
xfs_dablk_t sibling)
{
+ struct xfs_da_state_path *path = &ds->state->path;
+ struct xfs_da_state_path *altpath = &ds->state->altpath;
int retval;
+ int plevel;
int error;
- memcpy(&ds->state->altpath, &ds->state->path,
- sizeof(ds->state->altpath));
+ memcpy(altpath, path, sizeof(ds->state->altpath));
/*
* If the pointer is null, we shouldn't be able to move the upper
* level pointer anywhere.
*/
if (sibling == 0) {
- error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
- direction, false, &retval);
+ error = xfs_da3_path_shift(ds->state, altpath, direction,
+ false, &retval);
if (error == 0 && retval == 0)
xchk_da_set_corrupt(ds, level);
error = 0;
@@ -239,27 +241,33 @@ xchk_da_btree_block_check_sibling(
}
/* Move the alternate cursor one block in the direction given. */
- error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
- direction, false, &retval);
+ error = xfs_da3_path_shift(ds->state, altpath, direction, false,
+ &retval);
if (!xchk_da_process_error(ds, level, &error))
- return error;
+ goto out;
if (retval) {
xchk_da_set_corrupt(ds, level);
- return error;
+ goto out;
}
- if (ds->state->altpath.blk[level].bp)
- xchk_buffer_recheck(ds->sc,
- ds->state->altpath.blk[level].bp);
+ if (altpath->blk[level].bp)
+ xchk_buffer_recheck(ds->sc, altpath->blk[level].bp);
/* Compare upper level pointer to sibling pointer. */
- if (ds->state->altpath.blk[level].blkno != sibling)
+ if (altpath->blk[level].blkno != sibling)
xchk_da_set_corrupt(ds, level);
- if (ds->state->altpath.blk[level].bp) {
- xfs_trans_brelse(ds->dargs.trans,
- ds->state->altpath.blk[level].bp);
- ds->state->altpath.blk[level].bp = NULL;
- }
+
out:
+ /* Free all buffers in the altpath that aren't referenced from path. */
+ for (plevel = 0; plevel < altpath->active; plevel++) {
+ if (altpath->blk[plevel].bp == NULL ||
+ (plevel < path->active &&
+ altpath->blk[plevel].bp == path->blk[plevel].bp))
+ continue;
+
+ xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp);
+ altpath->blk[plevel].bp = NULL;
+ }
+
return error;
}
@@ -359,11 +367,11 @@ xchk_da_btree_block(
pmaxrecs = &ds->maxrecs[level];
/* We only started zeroing the header on v5 filesystems. */
- if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad)
+ if (xfs_has_crc(ds->sc->mp) && hdr3->hdr.pad)
xchk_da_set_corrupt(ds, level);
/* Check the owner. */
- if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) {
+ if (xfs_has_crc(ip->i_mount)) {
owner = be64_to_cpu(hdr3->owner);
if (owner != ip->i_ino)
xchk_da_set_corrupt(ds, level);
@@ -433,6 +441,20 @@ xchk_da_btree_block(
goto out_freebp;
}
+ /*
+ * If we've been handed a block that is below the dabtree root, does
+ * its hashval match what the parent block expected to see?
+ */
+ if (level > 0) {
+ struct xfs_da_node_entry *key;
+
+ key = xchk_da_btree_node_entry(ds, level - 1);
+ if (be32_to_cpu(key->hashval) != blk->hashval) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
out:
return error;
out_freebp:
@@ -451,7 +473,7 @@ xchk_da_btree(
xchk_da_btree_rec_fn scrub_fn,
void *private)
{
- struct xchk_da_btree ds = {};
+ struct xchk_da_btree *ds;
struct xfs_mount *mp = sc->mp;
struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *key;
@@ -460,38 +482,39 @@ xchk_da_btree(
int error;
/* Skip short format data structures; no btree to scan. */
- if (!xfs_ifork_has_extents(sc->ip, whichfork))
+ if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork)))
return 0;
/* Set up initial da state. */
- ds.dargs.dp = sc->ip;
- ds.dargs.whichfork = whichfork;
- ds.dargs.trans = sc->tp;
- ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
- ds.state = xfs_da_state_alloc();
- ds.state->args = &ds.dargs;
- ds.state->mp = mp;
- ds.sc = sc;
- ds.private = private;
+ ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ if (!ds)
+ return -ENOMEM;
+ ds->dargs.dp = sc->ip;
+ ds->dargs.whichfork = whichfork;
+ ds->dargs.trans = sc->tp;
+ ds->dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds->state = xfs_da_state_alloc(&ds->dargs);
+ ds->sc = sc;
+ ds->private = private;
if (whichfork == XFS_ATTR_FORK) {
- ds.dargs.geo = mp->m_attr_geo;
- ds.lowest = 0;
- ds.highest = 0;
+ ds->dargs.geo = mp->m_attr_geo;
+ ds->lowest = 0;
+ ds->highest = 0;
} else {
- ds.dargs.geo = mp->m_dir_geo;
- ds.lowest = ds.dargs.geo->leafblk;
- ds.highest = ds.dargs.geo->freeblk;
+ ds->dargs.geo = mp->m_dir_geo;
+ ds->lowest = ds->dargs.geo->leafblk;
+ ds->highest = ds->dargs.geo->freeblk;
}
- blkno = ds.lowest;
+ blkno = ds->lowest;
level = 0;
/* Find the root of the da tree, if present. */
- blks = ds.state->path.blk;
- error = xchk_da_btree_block(&ds, level, blkno);
+ blks = ds->state->path.blk;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out_state;
/*
- * We didn't find a block at ds.lowest, which means that there's
+ * We didn't find a block at ds->lowest, which means that there's
* no LEAF1/LEAFN tree (at least not where it's supposed to be),
* so jump out now.
*/
@@ -503,16 +526,16 @@ xchk_da_btree(
/* Handle leaf block. */
if (blks[level].magic != XFS_DA_NODE_MAGIC) {
/* End of leaf, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Dispatch record scrubbing. */
- error = scrub_fn(&ds, level);
+ error = scrub_fn(ds, level);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
@@ -525,17 +548,17 @@ xchk_da_btree(
/* End of node, pop back towards the root. */
- if (blks[level].index >= ds.maxrecs[level]) {
+ if (blks[level].index >= ds->maxrecs[level]) {
if (level > 0)
blks[level - 1].index++;
- ds.tree_level++;
+ ds->tree_level++;
level--;
continue;
}
/* Hashes in order for scrub? */
- key = xchk_da_btree_node_entry(&ds, level);
- error = xchk_da_btree_hash(&ds, level, &key->hashval);
+ key = xchk_da_btree_node_entry(ds, level);
+ error = xchk_da_btree_hash(ds, level, &key->hashval);
if (error)
goto out;
@@ -544,11 +567,11 @@ xchk_da_btree(
level++;
if (level >= XFS_DA_NODE_MAXDEPTH) {
/* Too deep! */
- xchk_da_set_corrupt(&ds, level - 1);
+ xchk_da_set_corrupt(ds, level - 1);
break;
}
- ds.tree_level--;
- error = xchk_da_btree_block(&ds, level, blkno);
+ ds->tree_level--;
+ error = xchk_da_btree_block(ds, level, blkno);
if (error)
goto out;
if (blks[level].bp == NULL)
@@ -567,6 +590,7 @@ out:
}
out_state:
- xfs_da_state_free(ds.state);
+ xfs_da_state_free(ds->state);
+ kmem_free(ds);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 266da4e4bde6..5c87800ab223 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -22,10 +22,9 @@
/* Set us up to scrub directories. */
int
xchk_setup_directory(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- return xchk_setup_inode_contents(sc, ip, 0);
+ return xchk_setup_inode_contents(sc, 0);
}
/* Directories */
@@ -52,7 +51,7 @@ xchk_dir_check_ftype(
int ino_dtype;
int error = 0;
- if (!xfs_sb_version_hasftype(&mp->m_sb)) {
+ if (!xfs_has_ftype(mp)) {
if (dtype != DT_UNKNOWN && dtype != DT_DIR)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
@@ -66,8 +65,18 @@ xchk_dir_check_ftype(
* eofblocks cleanup (which allocates what would be a nested
* transaction), we can't use DONTCACHE here because DONTCACHE
* inodes can trigger immediate inactive cleanup of the inode.
+ *
+ * If _iget returns -EINVAL or -ENOENT then the child inode number is
+ * garbage and the directory is corrupt. If the _iget returns
+ * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
+ * cross referencing error. Any other error is an operational error.
*/
error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ error = -EFSCORRUPTED;
+ xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error);
+ goto out;
+ }
if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset,
&error))
goto out;
@@ -90,7 +99,7 @@ out:
* we check the inode number to make sure it's sane, then we check that
* we can look up this filename. Finally, we check the ftype.
*/
-STATIC int
+STATIC bool
xchk_dir_actor(
struct dir_context *dir_iter,
const char *name,
@@ -105,6 +114,7 @@ xchk_dir_actor(
struct xfs_name xname;
xfs_ino_t lookup_ino;
xfs_dablk_t offset;
+ bool checked_ftype = false;
int error = 0;
sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter);
@@ -114,7 +124,7 @@ xchk_dir_actor(
xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
if (xchk_should_terminate(sdc->sc, &error))
- return error;
+ return !error;
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
@@ -130,9 +140,10 @@ xchk_dir_actor(
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
- if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ if (xfs_has_ftype(mp) && type != DT_DIR)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
+ checked_ftype = true;
if (ino != ip->i_ino)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
@@ -141,9 +152,10 @@ xchk_dir_actor(
* If this is ".." in the root inode, check that the inum
* matches this dir.
*/
- if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+ if (xfs_has_ftype(mp) && type != DT_DIR)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
+ checked_ftype = true;
if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino)
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
offset);
@@ -155,6 +167,9 @@ xchk_dir_actor(
xname.type = XFS_DIR3_FT_UNKNOWN;
error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+ /* ENOENT means the hash lookup failed and the dir is corrupt */
+ if (error == -ENOENT)
+ error = -EFSCORRUPTED;
if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
&error))
goto out;
@@ -164,9 +179,11 @@ xchk_dir_actor(
}
/* Verify the file type. This function absorbs error codes. */
- error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type);
- if (error)
- goto out;
+ if (!checked_ftype) {
+ error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type);
+ if (error)
+ goto out;
+ }
out:
/*
* A negative error code returned here is supposed to cause the
@@ -174,8 +191,8 @@ out:
* and return zero to xchk_directory.
*/
if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return -EFSCORRUPTED;
- return error;
+ return false;
+ return !error;
}
/* Scrub a directory btree record. */
@@ -480,6 +497,7 @@ STATIC int
xchk_directory_leaf1_bestfree(
struct xfs_scrub *sc,
struct xfs_da_args *args,
+ xfs_dir2_db_t last_data_db,
xfs_dablk_t lblk)
{
struct xfs_dir3_icleaf_hdr leafhdr;
@@ -500,7 +518,7 @@ xchk_directory_leaf1_bestfree(
/* Read the free space block. */
error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
- goto out;
+ return error;
xchk_buffer_recheck(sc, bp);
leaf = bp->b_addr;
@@ -509,7 +527,7 @@ xchk_directory_leaf1_bestfree(
bestcount = be32_to_cpu(ltp->bestcount);
bestp = xfs_dir2_leaf_bests_p(ltp);
- if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ if (xfs_has_crc(sc->mp)) {
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
if (hdr3->pad != cpu_to_be32(0))
@@ -517,10 +535,14 @@ xchk_directory_leaf1_bestfree(
}
/*
- * There should be as many bestfree slots as there are dir data
- * blocks that can fit under i_size.
+ * There must be enough bestfree slots to cover all the directory data
+ * blocks that we scanned. It is possible for there to be a hole
+ * between the last data block and i_disk_size. This seems like an
+ * oversight to the scrub author, but as we have been writing out
+ * directories like this (and xfs_repair doesn't mind them) for years,
+ * that's what we have to check.
*/
- if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) {
+ if (bestcount != last_data_db + 1) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
@@ -555,19 +577,33 @@ xchk_directory_leaf1_bestfree(
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
best = be16_to_cpu(*bestp);
- if (best == NULLDATAOFF)
- continue;
error = xfs_dir3_data_read(sc->tp, sc->ip,
- i * args->geo->fsbcount, 0, &dbp);
+ xfs_dir2_db_to_da(args->geo, i),
+ XFS_DABUF_MAP_HOLE_OK,
+ &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
- xchk_directory_check_freesp(sc, lblk, dbp, best);
+
+ if (!dbp) {
+ if (best != NULLDATAOFF) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ break;
+ }
+ continue;
+ }
+
+ if (best == NULLDATAOFF)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ else
+ xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ break;
}
out:
+ xfs_trans_brelse(sc->tp, bp);
return error;
}
@@ -589,10 +625,10 @@ xchk_directory_free_bestfree(
/* Read the free space block */
error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
- goto out;
+ return error;
xchk_buffer_recheck(sc, bp);
- if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+ if (xfs_has_crc(sc->mp)) {
struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
if (hdr3->pad != cpu_to_be32(0))
@@ -612,7 +648,7 @@ xchk_directory_free_bestfree(
0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- break;
+ goto out;
xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
}
@@ -620,6 +656,7 @@ xchk_directory_free_bestfree(
if (freehdr.nused + stale != freehdr.nvalid)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
out:
+ xfs_trans_brelse(sc->tp, bp);
return error;
}
@@ -630,23 +667,23 @@ xchk_directory_blocks(
{
struct xfs_bmbt_irec got;
struct xfs_da_args args;
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
xfs_fileoff_t free_lblk;
xfs_fileoff_t lblk;
struct xfs_iext_cursor icur;
xfs_dablk_t dabno;
+ xfs_dir2_db_t last_data_db = 0;
bool found;
- int is_block = 0;
+ bool is_block = false;
int error;
/* Ignore local format directories. */
- if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
return 0;
- ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
@@ -662,15 +699,6 @@ xchk_directory_blocks(
/* Iterate all the data extents in the directory... */
found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
- /* Block directories only have a single block at offset 0. */
- if (is_block &&
- (got.br_startoff > 0 ||
- got.br_blockcount != args.geo->fsbcount)) {
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
- got.br_startoff);
- break;
- }
-
/* No more data blocks... */
if (got.br_startoff >= leaf_lblk)
break;
@@ -690,6 +718,7 @@ xchk_directory_blocks(
args.geo->fsbcount);
lblk < got.br_startoff + got.br_blockcount;
lblk += args.geo->fsbcount) {
+ last_data_db = xfs_dir2_da_to_db(args.geo, lblk);
error = xchk_directory_data_bestfree(sc, lblk,
is_block);
if (error)
@@ -712,7 +741,7 @@ xchk_directory_blocks(
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
- error = xchk_directory_leaf1_bestfree(sc, &args,
+ error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db,
leaf_lblk);
if (error)
goto out;
@@ -785,7 +814,7 @@ xchk_directory(
return -ENOENT;
/* Plausible size? */
- if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) {
+ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
goto out;
}
@@ -811,7 +840,7 @@ xchk_directory(
* Userspace usually asks for a 32k buffer, so we will too.
*/
bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
- sc->ip->i_d.di_size);
+ sc->ip->i_disk_size);
/*
* Look up every name in this directory by hash.
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ec2064ed3c30..6a6f8fe7f87c 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -9,10 +9,11 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
+#include "xfs_btree.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -70,17 +71,17 @@ xchk_fscount_warmup(
xfs_agnumber_t agno;
int error = 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
-
+ for_each_perag(mp, agno, pag) {
+ if (xchk_should_terminate(sc, &error))
+ break;
if (pag->pagi_init && pag->pagf_init)
- goto next_loop_perag;
+ continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
if (error)
break;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
if (error)
break;
@@ -88,21 +89,15 @@ xchk_fscount_warmup(
* These are supposed to be initialized by the header read
* function.
*/
- error = -EFSCORRUPTED;
- if (!pag->pagi_init || !pag->pagf_init)
+ if (!pag->pagi_init || !pag->pagf_init) {
+ error = -EFSCORRUPTED;
break;
+ }
xfs_buf_relse(agf_bp);
agf_bp = NULL;
xfs_buf_relse(agi_bp);
agi_bp = NULL;
-next_loop_perag:
- xfs_perag_put(pag);
- pag = NULL;
- error = 0;
-
- if (xchk_should_terminate(sc, &error))
- break;
}
if (agf_bp)
@@ -116,8 +111,7 @@ next_loop_perag:
int
xchk_setup_fscounters(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
struct xchk_fscounters *fsc;
int error;
@@ -144,6 +138,35 @@ xchk_setup_fscounters(
return xchk_trans_alloc(sc, 0);
}
+/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
+static int
+xchk_fscount_btreeblks(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc,
+ xfs_agnumber_t agno)
+{
+ xfs_extlen_t blocks;
+ int error;
+
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
+ if (error)
+ goto out_free;
+
+ error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
+ if (error)
+ goto out_free;
+ fsc->fdblocks += blocks - 1;
+
+ error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
+ if (error)
+ goto out_free;
+ fsc->fdblocks += blocks - 1;
+
+out_free:
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
/*
* Calculate what the global in-core counters ought to be from the incore
* per-AG structure. Callers can compare this to the actual in-core counters
@@ -167,13 +190,14 @@ retry:
fsc->ifree = 0;
fsc->fdblocks = 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
+ if (xchk_should_terminate(sc, &error))
+ break;
/* This somehow got unset since the warmup? */
if (!pag->pagi_init || !pag->pagf_init) {
- xfs_perag_put(pag);
- return -EFSCORRUPTED;
+ error = -EFSCORRUPTED;
+ break;
}
/* Count all the inodes */
@@ -183,7 +207,13 @@ retry:
/* Add up the free/freelist/bnobt/cntbt blocks */
fsc->fdblocks += pag->pagf_freeblks;
fsc->fdblocks += pag->pagf_flcount;
- fsc->fdblocks += pag->pagf_btreeblks;
+ if (xfs_has_lazysbcount(sc->mp)) {
+ fsc->fdblocks += pag->pagf_btreeblks;
+ } else {
+ error = xchk_fscount_btreeblks(sc, fsc, agno);
+ if (error)
+ break;
+ }
/*
* Per-AG reservations are taken out of the incore counters,
@@ -192,12 +222,9 @@ retry:
fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
- xfs_perag_put(pag);
-
- if (xchk_should_terminate(sc, &error))
- break;
}
-
+ if (pag)
+ xfs_perag_put(pag);
if (error)
return error;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 83d27cdf579b..aa65ec88a0c0 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -8,7 +8,9 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_btree.h"
-#include "xfs_sb.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
@@ -133,7 +135,8 @@ xchk_update_health(
if (!sc->sick_mask)
return;
- bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT);
+ bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT));
switch (type_to_health_flag[sc->sm->sm_type].group) {
case XHG_AG:
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 681758704fda..e312be7cd375 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -21,6 +21,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub inode btrees.
@@ -29,10 +30,9 @@
*/
int
xchk_setup_ag_iallocbt(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER);
+ return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER);
}
/* Inode btree scrubber. */
@@ -104,13 +104,12 @@ xchk_iallocbt_chunk(
xfs_extlen_t len)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+
+ if (!xfs_verify_agbext(pag, bno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
@@ -164,7 +163,7 @@ xchk_iallocbt_check_cluster_ifree(
* the record, compute which fs inode we're talking about.
*/
agino = irec->ir_startino + irec_ino;
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino);
irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
@@ -212,10 +211,9 @@ xchk_iallocbt_check_cluster(
{
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
- struct xfs_dinode *dip;
struct xfs_buf *cluster_bp;
unsigned int nr_inodes;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t agbno;
unsigned int cluster_index;
uint16_t cluster_mask = 0;
@@ -278,8 +276,7 @@ xchk_iallocbt_check_cluster(
&XFS_RMAP_OINFO_INODES);
/* Grab the inode cluster buffer. */
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
- 0, 0);
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &cluster_bp);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
return error;
@@ -420,13 +417,13 @@ xchk_iallocbt_rec_alignment(
STATIC int
xchk_iallocbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
xfs_extlen_t len;
int holecount;
@@ -448,8 +445,8 @@ xchk_iallocbt_rec(
agino = irec.ir_startino;
/* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(mp, agno, agino) ||
- !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+ if (!xfs_verify_agino(pag, agino) ||
+ !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
@@ -519,7 +516,7 @@ xchk_iallocbt_xref_rmap_btreeblks(
int error;
if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
- (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) ||
+ (xfs_has_finobt(sc->mp) && !sc->sa.fino_cur) ||
xchk_skip_xref(sc->sm))
return;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6d483ab29e63..51820b40ab1c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -28,8 +28,7 @@
*/
int
xchk_setup_inode(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
int error;
@@ -37,7 +36,7 @@ xchk_setup_inode(
* Try to get the inode. If the verifiers fail, we try again
* in raw mode.
*/
- error = xchk_get_inode(sc, ip);
+ error = xchk_get_inode(sc);
switch (error) {
case 0:
break;
@@ -74,11 +73,25 @@ xchk_inode_extsize(
uint16_t flags)
{
xfs_failaddr_t fa;
+ uint32_t value = be32_to_cpu(dip->di_extsize);
- fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize),
- mode, flags);
+ fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags);
if (fa)
xchk_ino_set_corrupt(sc, ino);
+
+ /*
+ * XFS allows a sysadmin to change the rt extent size when adding a rt
+ * section to a filesystem after formatting. If there are any
+ * directories with extszinherit and rtinherit set, the hint could
+ * become misaligned with the new rextsize. The verifier doesn't check
+ * this, because we allow rtinherit directories even without an rt
+ * device. Flag this as an administrative warning since we will clean
+ * this up eventually.
+ */
+ if ((flags & XFS_DIFLAG_RTINHERIT) &&
+ (flags & XFS_DIFLAG_EXTSZINHERIT) &&
+ value % sc->mp->m_sb.sb_rextsize > 0)
+ xchk_ino_set_warning(sc, ino);
}
/*
@@ -121,8 +134,7 @@ xchk_inode_flags(
goto bad;
/* rt flags require rt device */
- if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) &&
- !mp->m_rtdev_targp)
+ if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
goto bad;
/* new rt bitmap flag only valid for rbmino */
@@ -169,7 +181,7 @@ xchk_inode_flags2(
/* reflink flag requires reflink feature */
if ((flags2 & XFS_DIFLAG2_REFLINK) &&
- !xfs_sb_version_hasreflink(&mp->m_sb))
+ !xfs_has_reflink(mp))
goto bad;
/* cowextsize flag is checked w.r.t. mode separately */
@@ -186,8 +198,8 @@ xchk_inode_flags2(
if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK))
goto bad;
- /* dax and reflink make no sense, currently */
- if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK))
+ /* no bigtime iflag without the bigtime feature */
+ if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
goto bad;
return;
@@ -195,6 +207,20 @@ bad:
xchk_ino_set_corrupt(sc, ino);
}
+static inline void
+xchk_dinode_nsec(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+
+ tv = xfs_inode_from_disk_ts(dip, ts);
+ if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
/* Scrub all the ondisk inode fields. */
STATIC void
xchk_dinode(
@@ -206,7 +232,9 @@ xchk_dinode(
size_t fork_recs;
unsigned long long isize;
uint64_t flags2;
- uint32_t nextents;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ prid_t prid;
uint16_t flags;
uint16_t mode;
@@ -241,6 +269,7 @@ xchk_dinode(
* so just mark this inode for preening.
*/
xchk_ino_set_preen(sc, ino);
+ prid = 0;
break;
case 2:
case 3:
@@ -251,14 +280,19 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
if (dip->di_projid_hi != 0 &&
- !xfs_sb_version_hasprojid32bit(&mp->m_sb))
+ !xfs_has_projid32(mp))
xchk_ino_set_corrupt(sc, ino);
+
+ prid = be16_to_cpu(dip->di_projid_lo);
break;
default:
xchk_ino_set_corrupt(sc, ino);
return;
}
+ if (xfs_has_projid32(mp))
+ prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16;
+
/*
* di_uid/di_gid -- -1 isn't invalid, but there's no way that
* userspace could have created that.
@@ -267,6 +301,13 @@ xchk_dinode(
dip->di_gid == cpu_to_be32(-1U))
xchk_ino_set_warning(sc, ino);
+ /*
+ * project id of -1 isn't supposed to be valid, but the kernel didn't
+ * always validate that.
+ */
+ if (prid == -1U)
+ xchk_ino_set_warning(sc, ino);
+
/* di_format */
switch (dip->di_format) {
case XFS_DINODE_FMT_DEV:
@@ -293,12 +334,9 @@ xchk_dinode(
}
/* di_[amc]time.nsec */
- if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_atime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_mtime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_ctime);
/*
* di_size. xfs_dinode_verify checks for things that screw up
@@ -353,8 +391,10 @@ xchk_dinode(
xchk_inode_extsize(sc, dip, ino, mode, flags);
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+
/* di_nextents */
- nextents = be32_to_cpu(dip->di_nextents);
fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -374,7 +414,7 @@ xchk_dinode(
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
- if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
xchk_ino_set_corrupt(sc, ino);
@@ -386,25 +426,23 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
/* di_anextents */
- nextents = be16_to_cpu(dip->di_anextents);
fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- if (nextents > fork_recs)
+ if (naextents > fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
- if (nextents <= fork_recs)
+ if (naextents <= fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
default:
- if (nextents != 0)
+ if (naextents != 0)
xchk_ino_set_corrupt(sc, ino);
}
if (dip->di_version >= 3) {
- if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_crtime);
xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
xchk_inode_cowextsize(sc, dip, ino, mode, flags,
flags2);
@@ -477,14 +515,14 @@ xchk_inode_xref_bmap(
&nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents < be32_to_cpu(dip->di_nextents))
+ if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents != be16_to_cpu(dip->di_anextents))
+ if (nextents != xfs_dfork_attr_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
@@ -509,9 +547,9 @@ xchk_inode_xref(
agno = XFS_INO_TO_AGNO(sc->mp, ino);
agbno = XFS_INO_TO_AGBNO(sc->mp, ino);
- error = xchk_ag_init(sc, agno, &sc->sa);
+ error = xchk_ag_init_existing(sc, agno, &sc->sa);
if (!xchk_xref_process_error(sc, agno, agbno, &error))
- return;
+ goto out_free;
xchk_xref_is_used_space(sc, agbno, 1);
xchk_inode_xref_finobt(sc, ino);
@@ -519,6 +557,7 @@ xchk_inode_xref(
xchk_xref_is_not_shared(sc, agbno, 1);
xchk_inode_xref_bmap(sc, dip);
+out_free:
xchk_ag_free(sc, &sc->sa);
}
@@ -537,7 +576,7 @@ xchk_inode_check_reflink_iflag(
bool has_shared;
int error;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return;
error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 5705adc43a75..d8dff3fd8053 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -20,10 +20,9 @@
/* Set us up to scrub parents. */
int
xchk_setup_parent(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- return xchk_setup_inode_contents(sc, ip, 0);
+ return xchk_setup_inode_contents(sc, 0);
}
/* Parent pointers */
@@ -39,7 +38,7 @@ struct xchk_parent_ctx {
};
/* Look for a single entry in a directory pointing to an inode. */
-STATIC int
+STATIC bool
xchk_parent_actor(
struct dir_context *dc,
const char *name,
@@ -63,7 +62,7 @@ xchk_parent_actor(
if (xchk_should_terminate(spc->sc, &error))
spc->cancelled = true;
- return error;
+ return !error;
}
/* Count the number of dentries in the parent dir that point to this inode. */
@@ -90,7 +89,7 @@ xchk_parent_count_parent_dentries(
* if there is one.
*/
lock_mode = xfs_ilock_data_map_shared(parent);
- if (parent->i_d.di_nextents > 0)
+ if (parent->i_df.if_nextents > 0)
error = xfs_dir3_data_readahead(parent, 0, 0);
xfs_iunlock(parent, lock_mode);
if (error)
@@ -102,7 +101,7 @@ xchk_parent_count_parent_dentries(
* scanned.
*/
bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
- parent->i_d.di_size);
+ parent->i_disk_size);
oldpos = 0;
while (true) {
error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
@@ -164,13 +163,13 @@ xchk_parent_validate(
* can't use DONTCACHE here because DONTCACHE inodes can trigger
* immediate inactive cleanup of the inode.
*
- * If _iget returns -EINVAL then the parent inode number is garbage
- * and the directory is corrupt. If the _iget returns -EFSCORRUPTED
- * or -EFSBADCRC then the parent is corrupt which is a cross
- * referencing error. Any other error is an operational error.
+ * If _iget returns -EINVAL or -ENOENT then the parent inode number is
+ * garbage and the directory is corrupt. If the _iget returns
+ * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a
+ * cross referencing error. Any other error is an operational error.
*/
error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp);
- if (error == -EINVAL) {
+ if (error == -EINVAL || error == -ENOENT) {
error = -EFSCORRUPTED;
xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
goto out;
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 905a34558361..21b4c9006859 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -18,17 +18,17 @@
#include "scrub/common.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline uint
+static inline xfs_dqtype_t
xchk_quota_to_dqtype(
struct xfs_scrub *sc)
{
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_UQUOTA:
- return XFS_DQ_USER;
+ return XFS_DQTYPE_USER;
case XFS_SCRUB_TYPE_GQUOTA:
- return XFS_DQ_GROUP;
+ return XFS_DQTYPE_GROUP;
case XFS_SCRUB_TYPE_PQUOTA:
- return XFS_DQ_PROJ;
+ return XFS_DQTYPE_PROJ;
default:
return 0;
}
@@ -37,23 +37,22 @@ xchk_quota_to_dqtype(
/* Set us up to scrub a quota. */
int
xchk_setup_quota(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- uint dqtype;
+ xfs_dqtype_t dqtype;
int error;
- if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
+ if (!XFS_IS_QUOTA_ON(sc->mp))
return -ENOENT;
dqtype = xchk_quota_to_dqtype(sc);
if (dqtype == 0)
return -EINVAL;
- sc->flags |= XCHK_HAS_QUOTAOFFLOCK;
- mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
+
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
- error = xchk_setup_fs(sc, ip);
+
+ error = xchk_setup_fs(sc);
if (error)
return error;
sc->ip = xfs_quota_inode(sc->mp, dqtype);
@@ -73,56 +72,29 @@ struct xchk_quota_info {
STATIC int
xchk_quota_item(
struct xfs_dquot *dq,
- uint dqtype,
+ xfs_dqtype_t dqtype,
void *priv)
{
struct xchk_quota_info *sqi = priv;
struct xfs_scrub *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
- struct xfs_disk_dquot *d = &dq->q_core;
struct xfs_quotainfo *qi = mp->m_quotainfo;
xfs_fileoff_t offset;
- unsigned long long bsoft;
- unsigned long long isoft;
- unsigned long long rsoft;
- unsigned long long bhard;
- unsigned long long ihard;
- unsigned long long rhard;
- unsigned long long bcount;
- unsigned long long icount;
- unsigned long long rcount;
xfs_ino_t fs_icount;
- xfs_dqid_t id = be32_to_cpu(d->d_id);
int error = 0;
if (xchk_should_terminate(sc, &error))
- return error;
+ return -ECANCELED;
/*
* Except for the root dquot, the actual dquot we got must either have
* the same or higher id as we saw before.
*/
- offset = id / qi->qi_dqperchunk;
- if (id && id <= sqi->last_id)
+ offset = dq->q_id / qi->qi_dqperchunk;
+ if (dq->q_id && dq->q_id <= sqi->last_id)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- sqi->last_id = id;
-
- /* Did we get the dquot type we wanted? */
- if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-
- if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0))
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-
- /* Check the limits. */
- bhard = be64_to_cpu(d->d_blk_hardlimit);
- ihard = be64_to_cpu(d->d_ino_hardlimit);
- rhard = be64_to_cpu(d->d_rtb_hardlimit);
-
- bsoft = be64_to_cpu(d->d_blk_softlimit);
- isoft = be64_to_cpu(d->d_ino_softlimit);
- rsoft = be64_to_cpu(d->d_rtb_softlimit);
+ sqi->last_id = dq->q_id;
/*
* Warn if the hard limits are larger than the fs.
@@ -132,25 +104,22 @@ xchk_quota_item(
* Complain about corruption if the soft limit is greater than
* the hard limit.
*/
- if (bhard > mp->m_sb.sb_dblocks)
+ if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (bsoft > bhard)
+ if (dq->q_blk.softlimit > dq->q_blk.hardlimit)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- if (ihard > M_IGEO(mp)->maxicount)
+ if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (isoft > ihard)
+ if (dq->q_ino.softlimit > dq->q_ino.hardlimit)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- if (rhard > mp->m_sb.sb_rblocks)
+ if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (rsoft > rhard)
+ if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/* Check the resource counts. */
- bcount = be64_to_cpu(d->d_bcount);
- icount = be64_to_cpu(d->d_icount);
- rcount = be64_to_cpu(d->d_rtbcount);
fs_icount = percpu_counter_sum(&mp->m_icount);
/*
@@ -158,16 +127,16 @@ xchk_quota_item(
* a reflink filesystem we're allowed to exceed physical space
* if there are no quota limits.
*/
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- if (mp->m_sb.sb_dblocks < bcount)
+ if (xfs_has_reflink(mp)) {
+ if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_warning(sc, XFS_DATA_FORK,
offset);
} else {
- if (mp->m_sb.sb_dblocks < bcount)
+ if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
offset);
}
- if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks)
+ if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/*
@@ -175,15 +144,24 @@ xchk_quota_item(
* lower limit than the actual usage. However, we flag it for
* admin review.
*/
- if (id != 0 && bhard != 0 && bcount > bhard)
+ if (dq->q_id == 0)
+ goto out;
+
+ if (dq->q_blk.hardlimit != 0 &&
+ dq->q_blk.count > dq->q_blk.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (id != 0 && ihard != 0 && icount > ihard)
+
+ if (dq->q_ino.hardlimit != 0 &&
+ dq->q_ino.count > dq->q_ino.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (id != 0 && rhard != 0 && rcount > rhard)
+
+ if (dq->q_rtb.hardlimit != 0 &&
+ dq->q_rtb.count > dq->q_rtb.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return -EFSCORRUPTED;
+ return -ECANCELED;
return 0;
}
@@ -207,7 +185,7 @@ xchk_quota_data_fork(
/* Check for data fork problems that apply only to quota files. */
max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
- ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
break;
@@ -235,7 +213,7 @@ xchk_quota(
struct xchk_quota_info sqi;
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
- uint dqtype;
+ xfs_dqtype_t dqtype;
int error = 0;
dqtype = xchk_quota_to_dqtype(sc);
@@ -259,6 +237,8 @@ xchk_quota(
error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
sc->ilock_flags = XFS_ILOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
+ if (error == -ECANCELED)
+ error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
sqi.last_id * qi->qi_dqperchunk, &error))
goto out;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 0cab11a5d390..a26ee0f24ef2 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -13,16 +13,18 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub reference count btrees.
*/
int
xchk_setup_ag_refcountbt(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- return xchk_setup_ag_btree(sc, ip, false);
+ return xchk_setup_ag_btree(sc, false);
}
/* Reference count btree scrubber. */
@@ -91,7 +93,7 @@ struct xchk_refcnt_check {
STATIC int
xchk_refcountbt_rmap_check(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xchk_refcnt_check *refchk = priv;
@@ -170,7 +172,6 @@ xchk_refcountbt_process_rmap_fragments(
*/
INIT_LIST_HEAD(&worklist);
rbno = NULLAGBLOCK;
- nr = 1;
/* Make sure the fragments actually /are/ in agbno order. */
bno = 0;
@@ -184,15 +185,14 @@ xchk_refcountbt_process_rmap_fragments(
* Find all the rmaps that start at or before the refc extent,
* and put them on the worklist.
*/
+ nr = 0;
list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
- if (frag->rm.rm_startblock > refchk->bno)
- goto done;
+ if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
+ break;
bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
if (bno < rbno)
rbno = bno;
list_move_tail(&frag->list, &worklist);
- if (nr == target_nr)
- break;
nr++;
}
@@ -269,15 +269,13 @@ done:
STATIC void
xchk_refcountbt_xref_rmap(
struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- xfs_nlink_t refcount)
+ const struct xfs_refcount_irec *irec)
{
struct xchk_refcnt_check refchk = {
- .sc = sc,
- .bno = bno,
- .len = len,
- .refcount = refcount,
+ .sc = sc,
+ .bno = irec->rc_startblock,
+ .len = irec->rc_blockcount,
+ .refcount = irec->rc_refcount,
.seen = 0,
};
struct xfs_rmap_irec low;
@@ -291,9 +289,9 @@ xchk_refcountbt_xref_rmap(
/* Cross-reference with the rmapbt to confirm the refcount. */
memset(&low, 0, sizeof(low));
- low.rm_startblock = bno;
+ low.rm_startblock = irec->rc_startblock;
memset(&high, 0xFF, sizeof(high));
- high.rm_startblock = bno + len - 1;
+ high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;
INIT_LIST_HEAD(&refchk.fragments);
error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
@@ -302,7 +300,7 @@ xchk_refcountbt_xref_rmap(
goto out_free;
xchk_refcountbt_process_rmap_fragments(&refchk);
- if (refcount != refchk.seen)
+ if (irec->rc_refcount != refchk.seen)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
out_free:
@@ -315,55 +313,45 @@ out_free:
/* Cross-reference with the other btrees. */
STATIC void
xchk_refcountbt_xref(
- struct xfs_scrub *sc,
- xfs_agblock_t agbno,
- xfs_extlen_t len,
- xfs_nlink_t refcount)
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *irec)
{
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
- xchk_xref_is_used_space(sc, agbno, len);
- xchk_xref_is_not_inode_chunk(sc, agbno, len);
- xchk_refcountbt_xref_rmap(sc, agbno, len, refcount);
+ xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount);
+ xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock,
+ irec->rc_blockcount);
+ xchk_refcountbt_xref_rmap(sc, irec);
}
/* Scrub a refcountbt record. */
STATIC int
xchk_refcountbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_refcount_irec irec;
xfs_agblock_t *cow_blocks = bs->private;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
- xfs_agblock_t bno;
- xfs_extlen_t len;
- xfs_nlink_t refcount;
- bool has_cowflag;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
- bno = be32_to_cpu(rec->refc.rc_startblock);
- len = be32_to_cpu(rec->refc.rc_blockcount);
- refcount = be32_to_cpu(rec->refc.rc_refcount);
+ xfs_refcount_btrec_to_irec(rec, &irec);
- /* Only CoW records can have refcount == 1. */
- has_cowflag = (bno & XFS_REFC_COW_START);
- if ((refcount == 1 && !has_cowflag) || (refcount != 1 && has_cowflag))
+ /* Check the domain and refcount are not incompatible. */
+ if (!xfs_refcount_check_domain(&irec))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (has_cowflag)
- (*cow_blocks) += len;
+
+ if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
+ (*cow_blocks) += irec.rc_blockcount;
/* Check the extent. */
- bno &= ~XFS_REFC_COW_START;
- if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (refcount == 0)
+ if (irec.rc_refcount == 0)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- xchk_refcountbt_xref(bs->sc, bno, len, refcount);
+ xchk_refcountbt_xref(bs->sc, &irec);
return 0;
}
@@ -427,7 +415,6 @@ xchk_xref_is_cow_staging(
xfs_extlen_t len)
{
struct xfs_refcount_irec rc;
- bool has_cowflag;
int has_refcount;
int error;
@@ -435,8 +422,8 @@ xchk_xref_is_cow_staging(
return;
/* Find the CoW staging extent. */
- error = xfs_refcount_lookup_le(sc->sa.refc_cur,
- agbno + XFS_REFC_COW_START, &has_refcount);
+ error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
+ agbno, &has_refcount);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
if (!has_refcount) {
@@ -452,9 +439,8 @@ xchk_xref_is_cow_staging(
return;
}
- /* CoW flag must be set, refcount must be 1. */
- has_cowflag = (rc.rc_startblock & XFS_REFC_COW_START);
- if (!has_cowflag || rc.rc_refcount != 1)
+ /* CoW lookup returned a shared extent record? */
+ if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
/* Must be at least as long as what was passed in */
@@ -478,7 +464,8 @@ xchk_xref_is_not_shared(
if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
+ error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED,
+ agbno, len, &shared);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
if (shared)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index e489d7a8446a..c18bd039fce9 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -22,8 +22,10 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
+#include "xfs_qm.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -37,19 +39,18 @@
*/
int
xrep_attempt(
- struct xfs_inode *ip,
struct xfs_scrub *sc)
{
int error = 0;
- trace_xrep_attempt(ip, sc->sm, error);
+ trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
xchk_ag_btcur_free(&sc->sa);
/* Repair whatever's broken. */
ASSERT(sc->ops->repair);
error = sc->ops->repair(sc);
- trace_xrep_done(ip, sc->sm, error);
+ trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
switch (error) {
case 0:
/*
@@ -198,7 +199,7 @@ xrep_calc_ag_resblks(
icount = pag->pagi_count;
} else {
/* Try to get the actual counters from disk. */
- error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+ error = xfs_ialloc_read_agi(pag, NULL, &bp);
if (!error) {
icount = pag->pagi_count;
xfs_buf_relse(bp);
@@ -206,32 +207,35 @@ xrep_calc_ag_resblks(
}
/* Now grab the block counters from the AGF. */
- error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
- if (!error) {
- aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
- freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
+ if (error) {
+ aglen = pag->block_count;
+ freelen = aglen;
+ usedlen = aglen;
+ } else {
+ struct xfs_agf *agf = bp->b_addr;
+
+ aglen = be32_to_cpu(agf->agf_length);
+ freelen = be32_to_cpu(agf->agf_freeblks);
usedlen = aglen - freelen;
xfs_buf_relse(bp);
}
- xfs_perag_put(pag);
/* If the icount is impossible, make some worst-case assumptions. */
if (icount == NULLAGINO ||
- !xfs_verify_agino(mp, sm->sm_agno, icount)) {
- xfs_agino_t first, last;
-
- xfs_agino_range(mp, sm->sm_agno, &first, &last);
- icount = last - first + 1;
+ !xfs_verify_agino(pag, icount)) {
+ icount = pag->agino_max - pag->agino_min + 1;
}
/* If the block counts are impossible, make worst-case assumptions. */
if (aglen == NULLAGBLOCK ||
- aglen != xfs_ag_block_count(mp, sm->sm_agno) ||
+ aglen != pag->block_count ||
freelen >= aglen) {
- aglen = xfs_ag_block_count(mp, sm->sm_agno);
+ aglen = pag->block_count;
freelen = aglen;
usedlen = aglen;
}
+ xfs_perag_put(pag);
trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
freelen, usedlen);
@@ -242,19 +246,19 @@ xrep_calc_ag_resblks(
* bnobt/cntbt or inobt/finobt as pairs.
*/
bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
- if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ if (xfs_has_sparseinodes(mp))
inobt_sz = xfs_iallocbt_calc_size(mp, icount /
XFS_INODES_PER_HOLEMASK_BIT);
else
inobt_sz = xfs_iallocbt_calc_size(mp, icount /
XFS_INODES_PER_CHUNK);
- if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ if (xfs_has_finobt(mp))
inobt_sz *= 2;
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
else
refcbt_sz = 0;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (xfs_has_rmapbt(mp)) {
/*
* Guess how many blocks we need to rebuild the rmapbt.
* For non-reflink filesystems we can't have more records than
@@ -263,7 +267,7 @@ xrep_calc_ag_resblks(
* many rmaps there could be in the AG, so we start off with
* what we hope is an generous over-estimation.
*/
- if (xfs_sb_version_hasreflink(&mp->m_sb))
+ if (xfs_has_reflink(mp))
rmapbt_sz = xfs_rmapbt_calc_size(mp,
(unsigned long long)aglen * 2);
else
@@ -293,16 +297,16 @@ xrep_alloc_ag_block(
switch (resv) {
case XFS_AG_RESV_AGFL:
case XFS_AG_RESV_RMAPBT:
- error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+ error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
+ sc->sa.agf_bp, &bno, 1);
if (error)
return error;
if (bno == NULLAGBLOCK)
return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
- 1, false);
- *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+ xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
+ *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
if (resv == XFS_AG_RESV_RMAPBT)
- xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
+ xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
return 0;
default:
break;
@@ -311,7 +315,7 @@ xrep_alloc_ag_block(
args.tp = sc->tp;
args.mp = sc->mp;
args.oinfo = *oinfo;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
@@ -346,14 +350,14 @@ xrep_init_btblock(
trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), btnum);
- ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+ ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
&bp);
if (error)
return error;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
bp->b_ops = ops;
@@ -434,10 +438,10 @@ xrep_init_btblock(
int
xrep_invalidate_blocks(
struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap)
+ struct xbitmap *bitmap)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
@@ -449,17 +453,20 @@ xrep_invalidate_blocks(
* because we never own those; and if we can't TRYLOCK the buffer we
* assume it's owned by someone else.
*/
- for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) {
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
+ int error;
+
/* Skip AG headers and post-EOFS blocks */
if (!xfs_verify_fsbno(sc->mp, fsbno))
continue;
- bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+ error = xfs_buf_incore(sc->mp->m_ddev_targp,
XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
- if (bp) {
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- }
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
}
return 0;
@@ -475,7 +482,7 @@ xrep_fix_freelist(
args.mp = sc->mp;
args.tp = sc->tp;
- args.agno = sc->sa.agno;
+ args.agno = sc->sa.pag->pag_agno;
args.alignment = 1;
args.pag = sc->sa.pag;
@@ -503,17 +510,17 @@ xrep_put_freelist(
* create an rmap for the block prior to merging it or else other
* parts will break.
*/
- error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
&XFS_RMAP_OINFO_AG);
if (error)
return error;
/* Put the block on the AGFL. */
- error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
- agbno, 0);
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ sc->sa.agfl_bp, agbno, 0);
if (error)
return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+ xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
@@ -529,13 +536,12 @@ xrep_reap_block(
{
struct xfs_btree_cur *cur;
struct xfs_buf *agf_bp = NULL;
- xfs_agnumber_t agno;
xfs_agblock_t agbno;
bool has_other_rmap;
int error;
- agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
/*
* If we are repairing per-inode metadata, we need to read in the AGF
@@ -543,13 +549,13 @@ xrep_reap_block(
* the AGF buffer that the setup functions already grabbed.
*/
if (sc->ip) {
- error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
} else {
agf_bp = sc->sa.agf_bp;
}
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
/* Can we find any other rmappings? */
error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
@@ -571,7 +577,8 @@ xrep_reap_block(
* to run xfs_repair.
*/
if (has_other_rmap)
- error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+ error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
+ 1, oinfo);
else if (resv == XFS_AG_RESV_AGFL)
error = xrep_put_freelist(sc, agbno);
else
@@ -595,31 +602,29 @@ out_free:
int
xrep_reap_extents(
struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap,
+ struct xbitmap *bitmap,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
xfs_fsblock_t fsbno;
int error = 0;
- ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
+ ASSERT(xfs_has_rmapbt(sc->mp));
- for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) {
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
ASSERT(sc->ip != NULL ||
- XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno);
+ XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
trace_xrep_dispose_btree_extent(sc->mp,
XFS_FSB_TO_AGNO(sc->mp, fsbno),
XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
error = xrep_reap_block(sc, fsbno, oinfo, type);
if (error)
- goto out;
+ break;
}
-out:
- xfs_bitmap_destroy(bitmap);
return error;
}
@@ -685,7 +690,7 @@ xrep_findroot_block(
int block_level;
int error = 0;
- daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
/*
* Blocks in the AGFL have stale contents that might just happen to
@@ -814,7 +819,7 @@ xrep_findroot_block(
else
fab->root = NULLAGBLOCK;
- trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
+ trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
xfs_trans_brelse(ri->sc->tp, bp);
@@ -828,7 +833,7 @@ out:
STATIC int
xrep_findroot_rmap(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xrep_findroot *ri = priv;
@@ -879,7 +884,7 @@ xrep_find_ag_btree_roots(
ri.sc = sc;
ri.btree_info = btree_info;
- ri.agf = XFS_BUF_TO_AGF(agf_bp);
+ ri.agf = agf_bp->b_addr;
ri.agfl_bp = agfl_bp;
for (fab = btree_info; fab->buf_ops; fab++) {
ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
@@ -888,7 +893,7 @@ xrep_find_ag_btree_roots(
fab->height = 0;
}
- cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
xfs_btree_del_cursor(cur, error);
@@ -899,19 +904,21 @@ xrep_find_ag_btree_roots(
void
xrep_force_quotacheck(
struct xfs_scrub *sc,
- uint dqtype)
+ xfs_dqtype_t type)
{
uint flag;
- flag = xfs_quota_chkd_flag(dqtype);
+ flag = xfs_quota_chkd_flag(type);
if (!(flag & sc->mp->m_qflags))
return;
+ mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
sc->mp->m_qflags &= ~flag;
spin_lock(&sc->mp->m_sb_lock);
sc->mp->m_sb.sb_qflags &= ~flag;
spin_unlock(&sc->mp->m_sb_lock);
xfs_log_sb(sc->tp);
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}
/*
@@ -939,12 +946,12 @@ xrep_ino_dqattach(
"inode %llu repair encountered quota error %d, quotacheck forced.",
(unsigned long long)sc->ip->i_ino, error);
if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
- xrep_force_quotacheck(sc, XFS_DQ_USER);
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
- xrep_force_quotacheck(sc, XFS_DQ_GROUP);
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
- xrep_force_quotacheck(sc, XFS_DQ_PROJ);
- /* fall through */
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+ fallthrough;
case -ESRCH:
error = 0;
break;
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index c3422403b169..840f74ec431c 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,8 @@
#ifndef __XFS_SCRUB_REPAIR_H__
#define __XFS_SCRUB_REPAIR_H__
+#include "xfs_quota_defs.h"
+
static inline int xrep_notsupported(struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
@@ -15,7 +17,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
/* Repair helpers */
-int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc);
+int xrep_attempt(struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
@@ -28,11 +30,11 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
-struct xfs_bitmap;
+struct xbitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
-int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist);
-int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist,
+int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
+int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
@@ -42,6 +44,9 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
+ /* in: maximum btree height */
+ unsigned int maxlevels;
+
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
@@ -49,7 +54,7 @@ struct xrep_find_ag_btree {
int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
-void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype);
+void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
/* Metadata repairers */
@@ -62,8 +67,8 @@ int xrep_agi(struct xfs_scrub *sc);
#else
-static inline int xrep_attempt(
- struct xfs_inode *ip,
+static inline int
+xrep_attempt(
struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8d4cefd761c1..229826b2e1c0 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -15,16 +15,16 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_ag.h"
/*
* Set us up to scrub reverse mapping btrees.
*/
int
xchk_setup_ag_rmapbt(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
- return xchk_setup_ag_btree(sc, ip, false);
+ return xchk_setup_ag_btree(sc, false);
}
/* Reverse-mapping scrubber. */
@@ -88,11 +88,11 @@ xchk_rmapbt_xref(
STATIC int
xchk_rmapbt_rec(
struct xchk_btree *bs,
- union xfs_btree_rec *rec)
+ const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
bool non_inode;
bool is_unwritten;
bool is_bmbt;
@@ -121,8 +121,8 @@ xchk_rmapbt_rec(
* Otherwise we must point somewhere past the static metadata
* but before the end of the FS. Run the regular check.
*/
- if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
- !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+ if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
+ !xfs_verify_agbno(pag, irec.rm_startblock +
irec.rm_blockcount - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index c642bc206c41..0a3bde64c675 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -13,18 +13,18 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
+#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
/* Set us up with the realtime metadata locked. */
int
xchk_setup_rt(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
int error;
- error = xchk_setup_fs(sc, ip);
+ error = xchk_setup_fs(sc);
if (error)
return error;
@@ -40,24 +40,58 @@ xchk_setup_rt(
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
+ const struct xfs_rtalloc_rec *rec,
void *priv)
{
struct xfs_scrub *sc = priv;
xfs_rtblock_t startblock;
xfs_rtblock_t blockcount;
- startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+ startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
- if (startblock + blockcount <= startblock ||
- !xfs_verify_rtbno(sc->mp, startblock) ||
- !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1))
+ if (!xfs_verify_rtext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
+/* Make sure the entire rtbitmap file is mapped with written extents. */
+STATIC int
+xchk_rtbitmap_check_extents(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_bmbt_irec map;
+ xfs_rtblock_t off;
+ int nmap;
+ int error = 0;
+
+ for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+ if (xchk_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ /* Make sure we have a written extent. */
+ nmap = 1;
+ error = xfs_bmapi_read(mp->m_rbmip, off,
+ mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+ XFS_DATA_FORK);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ break;
+
+ if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ break;
+ }
+
+ off += map.br_blockcount;
+ }
+
+ return error;
+}
+
/* Scrub the realtime bitmap. */
int
xchk_rtbitmap(
@@ -65,12 +99,23 @@ xchk_rtbitmap(
{
int error;
+ /* Is the size of the rtbitmap correct? */
+ if (sc->mp->m_rbmip->i_disk_size !=
+ XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
+ xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+ return 0;
+ }
+
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
+ error = xchk_rtbitmap_check_extents(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index f1775bb19313..2e8e400f10a9 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -149,9 +149,10 @@ xchk_probe(
STATIC int
xchk_teardown(
struct xfs_scrub *sc,
- struct xfs_inode *ip_in,
int error)
{
+ struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
+
xchk_ag_free(sc, &sc->sa);
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
@@ -168,12 +169,10 @@ xchk_teardown(
xfs_irele(sc->ip);
sc->ip = NULL;
}
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ mnt_drop_write_file(sc->file);
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
- if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
- mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
- sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
- }
if (sc->buf) {
kmem_free(sc->buf);
sc->buf = NULL;
@@ -236,21 +235,21 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
.scrub = xchk_finobt,
- .has = xfs_sb_version_hasfinobt,
+ .has = xfs_has_finobt,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_rmapbt,
.scrub = xchk_rmapbt,
- .has = xfs_sb_version_hasrmapbt,
+ .has = xfs_has_rmapbt,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_refcountbt,
.scrub = xchk_refcountbt,
- .has = xfs_sb_version_hasreflink,
+ .has = xfs_has_reflink,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
@@ -305,14 +304,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rt,
.scrub = xchk_rtbitmap,
- .has = xfs_sb_version_hasrealtime,
+ .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xchk_setup_rt,
.scrub = xchk_rtsummary,
- .has = xfs_sb_version_hasrealtime,
+ .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
@@ -341,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
};
-/* This isn't a stable feature, warn once per day. */
-static inline void
-xchk_experimental_warning(
- struct xfs_mount *mp)
-{
- static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
- "xchk_warning", 86400 * HZ, 1);
- ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
-
- if (__ratelimit(&scrub_warning))
- xfs_alert(mp,
-"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
-}
-
static int
xchk_validate_inputs(
struct xfs_mount *mp,
@@ -380,7 +365,7 @@ xchk_validate_inputs(
if (ops->setup == NULL || ops->scrub == NULL)
goto out;
/* Does this fs even support this type of metadata? */
- if (ops->has && !ops->has(&mp->m_sb))
+ if (ops->has && !ops->has(mp))
goto out;
error = -EINVAL;
@@ -412,11 +397,11 @@ xchk_validate_inputs(
*/
if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
error = -EOPNOTSUPP;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_has_crc(mp))
goto out;
error = -EROFS;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
goto out;
}
@@ -455,81 +440,95 @@ static inline void xchk_postmortem(struct xfs_scrub *sc)
/* Dispatch metadata scrubbing. */
int
xfs_scrub_metadata(
- struct xfs_inode *ip,
+ struct file *file,
struct xfs_scrub_metadata *sm)
{
- struct xfs_scrub sc = {
- .mp = ip->i_mount,
- .sm = sm,
- .sa = {
- .agno = NULLAGNUMBER,
- },
- };
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_scrub *sc;
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
(sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));
- trace_xchk_start(ip, sm, error);
+ trace_xchk_start(XFS_I(file_inode(file)), sm, error);
/* Forbidden if we are shut down or mounted norecovery. */
error = -ESHUTDOWN;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
goto out;
error = -ENOTRECOVERABLE;
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
goto out;
error = xchk_validate_inputs(mp, sm);
if (error)
goto out;
- xchk_experimental_warning(mp);
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
+ "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
+
+ sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ if (!sc) {
+ error = -ENOMEM;
+ goto out;
+ }
- sc.ops = &meta_scrub_ops[sm->sm_type];
- sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
+ sc->mp = mp;
+ sc->file = file;
+ sc->sm = sm;
+ sc->ops = &meta_scrub_ops[sm->sm_type];
+ sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
+ /*
+ * When repairs are allowed, prevent freezing or readonly remount while
+ * scrub is running with a real transaction.
+ */
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+ error = mnt_want_write_file(sc->file);
+ if (error)
+ goto out_sc;
+ }
+
/* Set up for the operation. */
- error = sc.ops->setup(&sc, ip);
+ error = sc->ops->setup(sc);
if (error)
goto out_teardown;
/* Scrub for errors. */
- error = sc.ops->scrub(&sc);
- if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
+ error = sc->ops->scrub(sc);
+ if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
* Tear down everything we hold, then set up again with
* preparation for worst-case scenarios.
*/
- error = xchk_teardown(&sc, ip, 0);
+ error = xchk_teardown(sc, 0);
if (error)
- goto out;
- sc.flags |= XCHK_TRY_HARDER;
+ goto out_sc;
+ sc->flags |= XCHK_TRY_HARDER;
goto retry_op;
- } else if (error)
+ } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
goto out_teardown;
- xchk_update_health(&sc);
+ xchk_update_health(sc);
- if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc.flags & XREP_ALREADY_FIXED)) {
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED)) {
bool needs_fix;
/* Let debug users force us into the repair routines. */
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
- needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
+ needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
if (!needs_fix) {
- sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
@@ -537,28 +536,30 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(ip, &sc);
+ error = xrep_attempt(sc);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
* get all the resources it needs; either way, we go
* back to the beginning and call the scrub function.
*/
- error = xchk_teardown(&sc, ip, 0);
+ error = xchk_teardown(sc, 0);
if (error) {
xrep_failure(mp);
- goto out;
+ goto out_sc;
}
goto retry_op;
}
}
out_nofix:
- xchk_postmortem(&sc);
+ xchk_postmortem(sc);
out_teardown:
- error = xchk_teardown(&sc, ip, error);
+ error = xchk_teardown(sc, error);
+out_sc:
+ kmem_free(sc);
out:
- trace_xchk_done(ip, sm, error);
+ trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
error = 0;
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index ad1ceb44a628..3de5287e98d8 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -18,8 +18,7 @@ enum xchk_type {
struct xchk_meta_ops {
/* Acquire whatever resources are needed for the operation. */
- int (*setup)(struct xfs_scrub *,
- struct xfs_inode *);
+ int (*setup)(struct xfs_scrub *sc);
/* Examine metadata for errors. */
int (*scrub)(struct xfs_scrub *);
@@ -28,7 +27,7 @@ struct xchk_meta_ops {
int (*repair)(struct xfs_scrub *);
/* Decide if we even have this piece of metadata. */
- bool (*has)(struct xfs_sb *);
+ bool (*has)(struct xfs_mount *);
/* type describing required/allowed inputs */
enum xchk_type type;
@@ -36,7 +35,6 @@ struct xchk_meta_ops {
/* Buffer pointers and btree cursors for an entire AG. */
struct xchk_ag {
- xfs_agnumber_t agno;
struct xfs_perag *pag;
/* AG btree roots */
@@ -59,7 +57,18 @@ struct xfs_scrub {
struct xfs_scrub_metadata *sm;
const struct xchk_meta_ops *ops;
struct xfs_trans *tp;
+
+ /* File that scrub was called with. */
+ struct file *file;
+
+ /*
+ * File that is undergoing the scrub operation. This can differ from
+ * the file that scrub was called with if we're checking file-based fs
+ * metadata (e.g. rt bitmaps) or if we're doing a scrub-by-handle for
+ * something that can't be opened directly (e.g. symlinks).
+ */
struct xfs_inode *ip;
+
void *buf;
uint ilock_flags;
@@ -79,7 +88,6 @@ struct xfs_scrub {
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
-#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */
#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */
#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 5641ae512c9e..75311f8daeeb 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -18,15 +18,14 @@
/* Set us up to scrub a symbolic link. */
int
xchk_setup_symlink(
- struct xfs_scrub *sc,
- struct xfs_inode *ip)
+ struct xfs_scrub *sc)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
if (!sc->buf)
return -ENOMEM;
- return xchk_setup_inode_contents(sc, ip, 0);
+ return xchk_setup_inode_contents(sc, 0);
}
/* Symbolic links. */
@@ -42,8 +41,8 @@ xchk_symlink(
if (!S_ISLNK(VFS_I(ip)->i_mode))
return -ENOENT;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- len = ip->i_d.di_size;
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ len = ip->i_disk_size;
/* Plausible size? */
if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
@@ -52,9 +51,9 @@ xchk_symlink(
}
/* Inline symlink? */
- if (ifp->if_flags & XFS_IFINLINE) {
- if (len > XFS_IFORK_DSIZE(ip) ||
- len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip)))
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
+ if (len > xfs_inode_data_fork_size(ip) ||
+ len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 9eaab2eb5ed3..b5f94676c37c 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "scrub/scrub.h"
+#include "xfs_ag.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
@@ -20,13 +21,14 @@ xchk_btree_cur_fsbno(
struct xfs_btree_cur *cur,
int level)
{
- if (level < cur->bc_nlevels && cur->bc_bufs[level])
- return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn);
- else if (level == cur->bc_nlevels - 1 &&
- cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino);
- else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0);
+ if (level < cur->bc_nlevels && cur->bc_levels[level].bp)
+ return XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(cur->bc_levels[level].bp));
+
+ if (level == cur->bc_nlevels - 1 &&
+ (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
+
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 096203119934..93ece6df02e3 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2,6 +2,10 @@
/*
* Copyright (C) 2017 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time. See xfs_trace.h for documentation of
+ * specific units found in tracepoint output.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM xfs_scrub
@@ -79,6 +83,16 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
{ XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+#define XFS_SCRUB_FLAG_STRINGS \
+ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
+ { XFS_SCRUB_OFLAG_CORRUPT, "corrupt" }, \
+ { XFS_SCRUB_OFLAG_PREEN, "preen" }, \
+ { XFS_SCRUB_OFLAG_XFAIL, "xfail" }, \
+ { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \
+ { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \
+ { XFS_SCRUB_OFLAG_WARNING, "warning" }, \
+ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -103,14 +117,14 @@ DECLARE_EVENT_CLASS(xchk_class,
__entry->flags = sm->sm_flags;
__entry->error = error;
),
- TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d",
+ TP_printk("dev %d:%d ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags (%s) error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->inum,
__entry->gen,
- __entry->flags,
+ __print_flags(__entry->flags, "|", XFS_SCRUB_FLAG_STRINGS),
__entry->error)
)
#define DEFINE_SCRUB_EVENT(name) \
@@ -145,7 +159,7 @@ TRACE_EVENT(xchk_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
@@ -176,10 +190,10 @@ TRACE_EVENT(xchk_file_op_error,
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->error,
@@ -193,29 +207,21 @@ DECLARE_EVENT_CLASS(xchk_block_error_class,
__field(dev_t, dev)
__field(unsigned int, type)
__field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, bno)
+ __field(xfs_agblock_t, agbno)
__field(void *, ret_ip)
),
TP_fast_assign(
- xfs_fsblock_t fsbno;
- xfs_agnumber_t agno;
- xfs_agblock_t bno;
-
- fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
- agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
- bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
-
__entry->dev = sc->mp->m_super->s_dev;
__entry->type = sc->sm->sm_type;
- __entry->agno = agno;
- __entry->bno = bno;
+ __entry->agno = xfs_daddr_to_agno(sc->mp, daddr);
+ __entry->agbno = xfs_daddr_to_agbno(sc->mp, daddr);
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
- __entry->bno,
+ __entry->agbno,
__entry->ret_ip)
)
@@ -281,10 +287,10 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class,
__entry->offset = offset;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->ret_ip)
@@ -342,11 +348,11 @@ TRACE_EVENT(xchk_btree_op_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
@@ -379,20 +385,20 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = sc->ip->i_ino;
- __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
@@ -425,10 +431,10 @@ TRACE_EVENT(xchk_btree_error,
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
@@ -459,19 +465,19 @@ TRACE_EVENT(xchk_ifork_btree_error,
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = sc->ip->i_ino;
- __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->whichfork,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
@@ -505,9 +511,9 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
+ __entry->ptr = cur->bc_levels[level].ptr;
),
- TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
@@ -580,7 +586,7 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->holemask = holemask;
__entry->cluster_ino = cluster_ino;
),
- TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x daddr 0x%llx bbcount 0x%x chunkino 0x%x nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startino,
@@ -670,7 +676,7 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -707,7 +713,7 @@ DECLARE_EVENT_CLASS(xrep_rmap_class,
__entry->offset = offset;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -745,7 +751,7 @@ TRACE_EVENT(xrep_refcount_extent_fn,
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -769,7 +775,7 @@ TRACE_EVENT(xrep_init_btblock,
__entry->agbno = agbno;
__entry->btnum = btnum;
),
- TP_printk("dev %d:%d agno %u agbno %u btree %s",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -793,7 +799,7 @@ TRACE_EVENT(xrep_findroot_block,
__entry->magic = magic;
__entry->level = level;
),
- TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x magic 0x%x level %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -821,7 +827,7 @@ TRACE_EVENT(xrep_calc_ag_resblks,
__entry->freelen = freelen;
__entry->usedlen = usedlen;
),
- TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u",
+ TP_printk("dev %d:%d agno 0x%x icount %u aglen %u freelen %u usedlen %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->icount,
@@ -850,7 +856,7 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__entry->rmapbt_sz = rmapbt_sz;
__entry->refcbt_sz = refcbt_sz;
),
- TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x bnobt %u inobt %u rmapbt %u refcountbt %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->bnobt_sz,
@@ -894,7 +900,7 @@ TRACE_EVENT(xrep_ialloc_insert,
__entry->freecount = freecount;
__entry->freemask = freemask;
),
- TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startino,
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index 2897ba3a17e6..2ceae614ade8 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -7,9 +7,9 @@
#define __XFS_SCRUB_H__
#ifndef CONFIG_XFS_ONLINE_SCRUB
-# define xfs_scrub_metadata(ip, sm) (-ENOTTY)
+# define xfs_scrub_metadata(file, sm) (-ENOTTY)
#else
-int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm);
+int xfs_scrub_metadata(struct file *file, struct xfs_scrub_metadata *sm);
#endif /* CONFIG_XFS_ONLINE_SCRUB */
#endif /* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index cd743fad8478..b744c62052b6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -10,10 +10,14 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_acl.h"
+#include "xfs_trans.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl_xattr.h>
@@ -67,10 +71,12 @@ xfs_acl_from_disk(
switch (acl_e->e_tag) {
case ACL_USER:
- acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+ acl_e->e_uid = make_kuid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_GROUP:
- acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
+ acl_e->e_gid = make_kgid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
@@ -103,10 +109,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
ace->ae_tag = cpu_to_be32(acl_e->e_tag);
switch (acl_e->e_tag) {
case ACL_USER:
- ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+ ace->ae_id = cpu_to_be32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
break;
case ACL_GROUP:
- ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+ ace->ae_id = cpu_to_be32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
break;
default:
ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
@@ -118,129 +126,124 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
}
struct posix_acl *
-xfs_get_acl(struct inode *inode, int type)
+xfs_get_acl(struct inode *inode, int type, bool rcu)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct posix_acl *acl = NULL;
- struct xfs_acl *xfs_acl = NULL;
- unsigned char *ea_name;
- int error;
- int len;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct posix_acl *acl = NULL;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ .valuelen = XFS_ACL_MAX_SIZE(mp),
+ };
+ int error;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
trace_xfs_get_acl(ip);
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = SGI_ACL_FILE;
+ args.name = SGI_ACL_FILE;
break;
case ACL_TYPE_DEFAULT:
- ea_name = SGI_ACL_DEFAULT;
+ args.name = SGI_ACL_DEFAULT;
break;
default:
BUG();
}
+ args.namelen = strlen(args.name);
/*
- * If we have a cached ACLs value just return it, not need to
- * go out to the disk.
+ * If the attribute doesn't exist make sure we have a negative cache
+ * entry, for any other error assume it is transient.
*/
- len = XFS_ACL_MAX_SIZE(ip->i_mount);
- error = xfs_attr_get(ip, ea_name, strlen(ea_name),
- (unsigned char **)&xfs_acl, &len,
- ATTR_ALLOC | ATTR_ROOT);
- if (error) {
- /*
- * If the attribute doesn't exist make sure we have a negative
- * cache entry, for any other error assume it is transient.
- */
- if (error != -ENOATTR)
- acl = ERR_PTR(error);
- } else {
- acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len,
- XFS_ACL_MAX_ENTRIES(ip->i_mount));
- kmem_free(xfs_acl);
+ error = xfs_attr_get(&args);
+ if (!error) {
+ acl = xfs_acl_from_disk(mp, args.value, args.valuelen,
+ XFS_ACL_MAX_ENTRIES(mp));
+ } else if (error != -ENOATTR) {
+ acl = ERR_PTR(error);
}
+
+ kmem_free(args.value);
return acl;
}
int
__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct xfs_inode *ip = XFS_I(inode);
- unsigned char *ea_name;
- int error;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ };
+ int error;
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = SGI_ACL_FILE;
+ args.name = SGI_ACL_FILE;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
- ea_name = SGI_ACL_DEFAULT;
+ args.name = SGI_ACL_DEFAULT;
break;
default:
return -EINVAL;
}
+ args.namelen = strlen(args.name);
if (acl) {
- struct xfs_acl *xfs_acl;
- int len = XFS_ACL_MAX_SIZE(ip->i_mount);
-
- xfs_acl = kmem_zalloc_large(len, 0);
- if (!xfs_acl)
+ args.valuelen = XFS_ACL_SIZE(acl->a_count);
+ args.value = kvzalloc(args.valuelen, GFP_KERNEL);
+ if (!args.value)
return -ENOMEM;
-
- xfs_acl_to_disk(xfs_acl, acl);
-
- /* subtract away the unused acl entries */
- len -= sizeof(struct xfs_acl_entry) *
- (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
-
- error = xfs_attr_set(ip, ea_name, strlen(ea_name),
- (unsigned char *)xfs_acl, len, ATTR_ROOT);
-
- kmem_free(xfs_acl);
- } else {
- /*
- * A NULL ACL argument means we want to remove the ACL.
- */
- error = xfs_attr_remove(ip, ea_name,
- strlen(ea_name),
- ATTR_ROOT);
-
- /*
- * If the attribute didn't exist to start with that's fine.
- */
- if (error == -ENOATTR)
- error = 0;
+ xfs_acl_to_disk(args.value, acl);
}
+ error = xfs_attr_change(&args);
+ kmem_free(args.value);
+
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (!acl && error == -ENOATTR)
+ error = 0;
if (!error)
set_cached_acl(inode, type, acl);
return error;
}
static int
-xfs_set_mode(struct inode *inode, umode_t mode)
+xfs_acl_set_mode(
+ struct inode *inode,
+ umode_t mode)
{
- int error = 0;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
- if (mode != inode->i_mode) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
- iattr.ia_mode = mode;
- iattr.ia_ctime = current_time(inode);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ return error;
- error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
- }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ inode->i_mode = mode;
+ inode->i_ctime = current_time(inode);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return error;
+ if (xfs_has_wsync(mp))
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
}
int
-xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
umode_t mode;
bool set_mode = false;
@@ -254,24 +257,36 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
if (type == ACL_TYPE_ACCESS) {
- error = posix_acl_update_mode(inode, &mode, &acl);
+ error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl);
if (error)
return error;
set_mode = true;
}
set_acl:
- error = __xfs_set_acl(inode, acl, type);
- if (error)
- return error;
-
/*
* We set the mode after successfully updating the ACL xattr because the
* xattr update can fail at ENOSPC and we don't want to change the mode
* if the ACL update hasn't been applied.
*/
- if (set_mode)
- error = xfs_set_mode(inode, mode);
-
+ error = __xfs_set_acl(inode, acl, type);
+ if (!error && set_mode && mode != inode->i_mode)
+ error = xfs_acl_set_mode(inode, mode);
return error;
}
+
+/*
+ * Invalidate any cached ACLs if the user has bypassed the ACL interface.
+ * We don't validate the content whatsoever so it is caller responsibility to
+ * provide data in valid format and ensure i_mode is consistent.
+ */
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name)
+{
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 94615e34bc86..263404d0bfda 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -10,17 +10,22 @@ struct inode;
struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
-extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu);
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+void xfs_forget_acl(struct inode *inode, const char *name);
#else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
+{
+ return 0;
+}
+static inline void xfs_forget_acl(struct inode *inode, const char *name)
{
- return NULL;
}
-# define xfs_set_acl NULL
#endif /* CONFIG_XFS_POSIX_ACL */
-extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
-
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 58e937be24ce..5d1a995b15f8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,47 +36,26 @@ XFS_WPC(struct iomap_writepage_ctx *ctx)
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
return ioend->io_offset + ioend->io_size >
- XFS_I(ioend->io_inode)->i_d.di_size;
-}
-
-STATIC int
-xfs_setfilesize_trans_alloc(
- struct iomap_ioend *ioend)
-{
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
- if (error)
- return error;
-
- ioend->io_private = tp;
-
- /*
- * We may pass freeze protection with a transaction. So tell lockdep
- * we released it.
- */
- __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
- /*
- * We hand off the transaction to the completion thread now, so
- * clear the flag here.
- */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
- return 0;
+ XFS_I(ioend->io_inode)->i_disk_size;
}
/*
* Update on-disk file size now that data has been written to disk.
*/
-STATIC int
-__xfs_setfilesize(
+int
+xfs_setfilesize(
struct xfs_inode *ip,
- struct xfs_trans *tp,
xfs_off_t offset,
size_t size)
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
xfs_fsize_t isize;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
+ if (error)
+ return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_new_eof(ip, offset + size);
@@ -88,55 +67,13 @@ __xfs_setfilesize(
trace_xfs_setfilesize(ip, offset, size);
- ip->i_d.di_size = isize;
+ ip->i_disk_size = isize;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return xfs_trans_commit(tp);
}
-int
-xfs_setfilesize(
- struct xfs_inode *ip,
- xfs_off_t offset,
- size_t size)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
- if (error)
- return error;
-
- return __xfs_setfilesize(ip, tp, offset, size);
-}
-
-STATIC int
-xfs_setfilesize_ioend(
- struct iomap_ioend *ioend,
- int error)
-{
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_trans *tp = ioend->io_private;
-
- /*
- * The transaction may have been allocated in the I/O submission thread,
- * thus we need to mark ourselves as being in a transaction manually.
- * Similarly for freeze protection.
- */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
- __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
-
- /* we abort the update if there was an IO error */
- if (error) {
- xfs_trans_cancel(tp);
- return error;
- }
-
- return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
-}
-
/*
* IO write completion.
*/
@@ -145,6 +82,7 @@ xfs_end_ioend(
struct iomap_ioend *ioend)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -158,20 +96,28 @@ xfs_end_ioend(
nofs_flag = memalloc_nofs_save();
/*
- * Just clean up the in-memory strutures if the fs has been shut down.
+ * Just clean up the in-memory structures if the fs has been shut down.
*/
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ if (xfs_is_shutdown(mp)) {
error = -EIO;
goto done;
}
/*
- * Clean up any COW blocks on an I/O error.
+ * Clean up all COW blocks and underlying data fork delalloc blocks on
+ * I/O error. The delalloc punch is required because this ioend was
+ * mapped to blocks in the COW fork and the associated pages are no
+ * longer dirty. If we don't remove delalloc blocks here, they become
+ * stale and can corrupt free space accounting on unmount.
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
+ xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, offset),
+ XFS_B_TO_FSB(mp, size));
+ }
goto done;
}
@@ -182,36 +128,28 @@ xfs_end_ioend(
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- else
- ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
+ if (!error && xfs_ioend_is_append(ioend))
+ error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
- if (ioend->io_private)
- error = xfs_setfilesize_ioend(ioend, error);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
/*
- * If the to be merged ioend has a preallocated transaction for file
- * size updates we need to ensure the ioend it is merged into also
- * has one. If it already has one we can simply cancel the transaction
- * as it is guaranteed to be clean.
+ * Finish all pending IO completions that require transactional modifications.
+ *
+ * We try to merge physical and logically contiguous ioends before completion to
+ * minimise the number of transactions we need to perform during IO completion.
+ * Both unwritten extent conversion and COW remapping need to iterate and modify
+ * one physical extent at a time, so we gain nothing by merging physically
+ * discontiguous extents here.
+ *
+ * The ioend chain length that we can be processing here is largely unbound in
+ * length and we may have to perform significant amounts of work on each ioend
+ * to complete it. Hence we have to be careful about holding the CPU for too
+ * long in this loop.
*/
-static void
-xfs_ioend_merge_private(
- struct iomap_ioend *ioend,
- struct iomap_ioend *next)
-{
- if (!ioend->io_private) {
- ioend->io_private = next->io_private;
- next->io_private = NULL;
- } else {
- xfs_setfilesize_ioend(next, -ECANCELED);
- }
-}
-
-/* Finish all pending io completions. */
void
xfs_end_io(
struct work_struct *work)
@@ -230,18 +168,12 @@ xfs_end_io(
while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
io_list))) {
list_del_init(&ioend->io_list);
- iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
+ iomap_ioend_try_merge(ioend, &tmp);
xfs_end_ioend(ioend);
+ cond_resched();
}
}
-static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
-{
- return ioend->io_private ||
- ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED);
-}
-
STATIC void
xfs_end_bio(
struct bio *bio)
@@ -250,8 +182,6 @@ xfs_end_bio(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
- ASSERT(xfs_ioend_needs_workqueue(ioend));
-
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
@@ -346,14 +276,14 @@ xfs_map_blocks(
ssize_t count = i_blocksize(inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_fileoff_t cow_fsb = NULLFILEOFF;
- int whichfork = XFS_DATA_FORK;
+ xfs_fileoff_t cow_fsb;
+ int whichfork;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -381,9 +311,10 @@ xfs_map_blocks(
* landed in a hole and we skip the block.
*/
retry:
+ cow_fsb = NULLFILEOFF;
+ whichfork = XFS_DATA_FORK;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- (ip->i_df.if_flags & XFS_IFEXTENTS));
+ ASSERT(!xfs_need_iread_extents(&ip->i_df));
/*
* Check if this is offset is covered by a COW extents, and if yes use
@@ -442,7 +373,7 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
@@ -499,17 +430,11 @@ xfs_prepare_ioend(
ioend->io_offset, ioend->io_size);
}
- /* Reserve log space if we might write beyond the on-disk inode size. */
- if (!status &&
- ((ioend->io_flags & IOMAP_F_SHARED) ||
- ioend->io_type != IOMAP_UNWRITTEN) &&
- xfs_ioend_is_append(ioend) &&
- !ioend->io_private)
- status = xfs_setfilesize_trans_alloc(ioend);
-
memalloc_nofs_restore(nofs_flag);
- if (xfs_ioend_needs_workqueue(ioend))
+ /* send ioends that might require a transaction to the completion wq */
+ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
+ (ioend->io_flags & IOMAP_F_SHARED))
ioend->io_bio->bi_end_io = xfs_end_bio;
return status;
}
@@ -526,54 +451,51 @@ xfs_prepare_ioend(
* see a ENOSPC in writeback).
*/
static void
-xfs_discard_page(
- struct page *page)
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- loff_t offset = page_offset(page);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
+ size_t offset = offset_in_folio(folio, pos);
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
+ xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
int error;
- if (XFS_FORCED_SHUTDOWN(mp))
- goto out_invalidate;
+ if (xfs_is_shutdown(mp))
+ return;
- xfs_alert(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
- page, ip->i_ino, offset);
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- PAGE_SIZE / i_blocksize(inode));
- if (error && !XFS_FORCED_SHUTDOWN(mp))
+ i_blocks_per_folio(inode, folio) - pageoff_fsb);
+ if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
-out_invalidate:
- iomap_invalidatepage(page, 0, PAGE_SIZE);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
.prepare_ioend = xfs_prepare_ioend,
- .discard_page = xfs_discard_page,
+ .discard_folio = xfs_discard_folio,
};
STATIC int
-xfs_vm_writepage(
- struct page *page,
- struct writeback_control *wbc)
-{
- struct xfs_writepage_ctx wpc = { };
-
- return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
-}
-
-STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
+ /*
+ * Writing back data in a transaction context can result in recursive
+ * transactions. This is bad, so issue a warning and get out of here.
+ */
+ if (WARN_ON_ONCE(current->journal_info))
+ return 0;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -614,21 +536,18 @@ xfs_vm_bmap(
}
STATIC int
-xfs_vm_readpage(
+xfs_vm_read_folio(
struct file *unused,
- struct page *page)
+ struct folio *folio)
{
- return iomap_readpage(page, &xfs_read_iomap_ops);
+ return iomap_read_folio(folio, &xfs_read_iomap_ops);
}
-STATIC int
-xfs_vm_readpages(
- struct file *unused,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned nr_pages)
+STATIC void
+xfs_vm_readahead(
+ struct readahead_control *rac)
{
- return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops);
+ iomap_readahead(rac, &xfs_read_iomap_ops);
}
static int
@@ -643,16 +562,15 @@ xfs_iomap_swapfile_activate(
}
const struct address_space_operations xfs_address_space_operations = {
- .readpage = xfs_vm_readpage,
- .readpages = xfs_vm_readpages,
- .writepage = xfs_vm_writepage,
+ .read_folio = xfs_vm_read_folio,
+ .readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .set_page_dirty = iomap_set_page_dirty,
- .releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = xfs_iomap_swapfile_activate,
@@ -661,7 +579,6 @@ const struct address_space_operations xfs_address_space_operations = {
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index bbfa6ba84dcd..5db87b34fb6e 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -15,10 +15,10 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
-#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_dir2.h"
@@ -145,19 +145,20 @@ xfs_attr3_node_inactive(
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
}
xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr);
- parent_blkno = bp->b_bn;
+ parent_blkno = xfs_buf_daddr(bp);
if (!ichdr.count) {
xfs_trans_brelse(*trans, bp);
return 0;
}
child_fsb = be32_to_cpu(ichdr.btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ bp = NULL;
/*
* If this is the node level just above the leaves, simply loop
@@ -177,7 +178,7 @@ xfs_attr3_node_inactive(
return error;
/* save for re-read later */
- child_blkno = XFS_BUF_ADDR(child_bp);
+ child_blkno = xfs_buf_daddr(child_bp);
/*
* Invalidate the subtree, however we have to.
@@ -194,7 +195,7 @@ xfs_attr3_node_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
break;
default:
- xfs_buf_corruption_error(child_bp);
+ xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
error = -EFSCORRUPTED;
break;
@@ -211,12 +212,8 @@ xfs_attr3_node_inactive(
&child_bp);
if (error)
return error;
- error = bp->b_error;
- if (error) {
- xfs_trans_brelse(*trans, child_bp);
- return error;
- }
xfs_trans_binval(*trans, child_bp);
+ child_bp = NULL;
/*
* If we're not done, re-read the parent to get the next
@@ -233,6 +230,7 @@ xfs_attr3_node_inactive(
bp->b_addr);
child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
+ bp = NULL;
}
/*
* Atomically commit the whole invalidate stuff.
@@ -271,7 +269,7 @@ xfs_attr3_root_inactive(
error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK);
if (error)
return error;
- blkno = bp->b_bn;
+ blkno = xfs_buf_daddr(bp);
/*
* Invalidate the tree, even if the "tree" is only a single leaf block.
@@ -289,7 +287,7 @@ xfs_attr3_root_inactive(
break;
default:
error = -EFSCORRUPTED;
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
break;
}
@@ -338,7 +336,7 @@ xfs_attr_inactive(
ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
- if (!XFS_IFORK_Q(dp))
+ if (!xfs_inode_has_attr_fork(dp))
goto out_destroy_fork;
xfs_iunlock(dp, lock_mode);
@@ -351,7 +349,7 @@ xfs_attr_inactive(
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(dp, lock_mode);
- if (!XFS_IFORK_Q(dp))
+ if (!xfs_inode_has_attr_fork(dp))
goto out_cancel;
/*
@@ -362,12 +360,11 @@ xfs_attr_inactive(
/*
* Invalidate and truncate the attribute fork extents. Make sure the
- * fork actually has attributes as otherwise the invalidation has no
+ * fork actually has xattr blocks as otherwise the invalidation has no
* blocks to read and returns an error. In this case, just do the fork
* removal below.
*/
- if (xfs_inode_hasattr(dp) &&
- dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_af.if_nextents > 0) {
error = xfs_attr3_root_inactive(&trans, dp);
if (error)
goto out_cancel;
@@ -388,8 +385,7 @@ out_cancel:
xfs_trans_cancel(trans);
out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
- if (dp->i_afp)
- xfs_idestroy_fork(dp, XFS_ATTR_FORK);
+ xfs_ifork_zap_attr(dp);
if (lock_mode)
xfs_iunlock(dp, lock_mode);
return error;
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
new file mode 100644
index 000000000000..2788a6f2edcd
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,881 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_attri_cache;
+struct kmem_cache *xfs_attrd_cache;
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attri_log_item, attri_item);
+}
+
+/*
+ * Shared xattr name/value buffers for logged extended attribute operations
+ *
+ * When logging updates to extended attributes, we can create quite a few
+ * attribute log intent items for a single xattr update. To avoid cycling the
+ * memory allocator and memcpy overhead, the name (and value, for setxattr)
+ * are kept in a refcounted object that is shared across all related log items
+ * and the upper-level deferred work state structure. The shared buffer has
+ * a control structure, followed by the name, and then the value.
+ */
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_get(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!refcount_inc_not_zero(&nv->refcount))
+ return NULL;
+ return nv;
+}
+
+static inline void
+xfs_attri_log_nameval_put(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!nv)
+ return;
+ if (refcount_dec_and_test(&nv->refcount))
+ kvfree(nv);
+}
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_alloc(
+ const void *name,
+ unsigned int name_len,
+ const void *value,
+ unsigned int value_len)
+{
+ struct xfs_attri_log_nameval *nv;
+
+ /*
+ * This could be over 64kB in length, so we have to use kvmalloc() for
+ * this. But kvmalloc() utterly sucks, so we use our own version.
+ */
+ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
+ name_len + value_len);
+
+ nv->name.i_addr = nv + 1;
+ nv->name.i_len = name_len;
+ nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
+ memcpy(nv->name.i_addr, name, name_len);
+
+ if (value_len) {
+ nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_len = value_len;
+ memcpy(nv->value.i_addr, value, value_len);
+ } else {
+ nv->value.i_addr = NULL;
+ nv->value.i_len = 0;
+ }
+ nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
+
+ refcount_set(&nv->refcount, 1);
+ return nv;
+}
+
+STATIC void
+xfs_attri_item_free(
+ struct xfs_attri_log_item *attrip)
+{
+ kmem_free(attrip->attri_item.li_lv_shadow);
+ xfs_attri_log_nameval_put(attrip->attri_nameval);
+ kmem_cache_free(xfs_attri_cache, attrip);
+}
+
+/*
+ * Freeing the attrip requires that we remove it from the AIL if it has already
+ * been placed there. However, the ATTRI may not yet have been placed in the
+ * AIL when called by xfs_attri_release() from ATTRD processing due to the
+ * ordering of committed vs unpin operations in bulk insert operations. Hence
+ * the reference count to ensure only the last caller frees the ATTRI.
+ */
+STATIC void
+xfs_attri_release(
+ struct xfs_attri_log_item *attrip)
+{
+ ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+ if (!atomic_dec_and_test(&attrip->attri_refcount))
+ return;
+
+ xfs_trans_ail_delete(&attrip->attri_item, 0);
+ xfs_attri_item_free(attrip);
+}
+
+STATIC void
+xfs_attri_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(nv->name.i_len);
+
+ if (!nv->value.i_len)
+ return;
+
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attri log
+ * item. We use 1 iovec for the attri_format_item, 1 for the name, and
+ * another for the value if it is present
+ */
+STATIC void
+xfs_attri_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ attrip->attri_format.alfi_type = XFS_LI_ATTRI;
+ attrip->attri_format.alfi_size = 1;
+
+ /*
+ * This size accounting must be done before copying the attrip into the
+ * iovec. If we do it after, the wrong size will be recorded to the log
+ * and we trip across assertion checks for bad region sizes later during
+ * the log recovery.
+ */
+
+ ASSERT(nv->name.i_len > 0);
+ attrip->attri_format.alfi_size++;
+
+ if (nv->value.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
+ &attrip->attri_format,
+ sizeof(struct xfs_attri_log_format));
+ xlog_copy_from_iovec(lv, &vecp, &nv->name);
+ if (nv->value.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->value);
+}
+
+/*
+ * The unpin operation is the last place an ATTRI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the ATTRI transaction has been successfully committed to make
+ * it this far. Therefore, we expect whoever committed the ATTRI to either
+ * construct and commit the ATTRD or drop the ATTRD's reference in the event of
+ * error. Simply drop the log's ATTRI reference now that the log is done with
+ * it.
+ */
+STATIC void
+xfs_attri_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+
+STATIC void
+xfs_attri_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an attri item. Caller may allocate an additional
+ * trailing buffer for name and value
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_nameval *nv)
+{
+ struct xfs_attri_log_item *attrip;
+
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ /*
+ * Grab an extra reference to the name/value buffer for this log item.
+ * The caller retains its own reference!
+ */
+ attrip->attri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attrip->attri_nameval);
+
+ xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+ &xfs_attri_item_ops);
+ attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+ atomic_set(&attrip->attri_refcount, 2);
+
+ return attrip;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attrd_log_item, attrd_item);
+}
+
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+ kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kmem_cache_free(xfs_attrd_cache, attrdp);
+}
+
+STATIC void
+xfs_attrd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_attrd_log_format);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attrd log item. We use
+ * only 1 iovec for the attrd_format, and we point that at the attr_log_format
+ * structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+ attrdp->attrd_format.alfd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+ &attrdp->attrd_format,
+ sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+
+ xfs_attri_release(attrdp->attrd_attrip);
+ xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty.. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds or
+ * fails to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+ struct xfs_attr_intent *attr,
+ struct xfs_attrd_log_item *attrdp)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ error = -EAGAIN;
+out:
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the ATTRI and frees the ATTRD
+ * 2.) shuts down the filesystem
+ */
+ args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+ /*
+ * attr intent/done items are null when logged attributes are disabled
+ */
+ if (attrdp)
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+ struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip,
+ const struct xfs_attr_intent *attr)
+{
+ struct xfs_attri_log_format *attrp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+ /*
+ * At this point the xfs_attr_intent has been constructed, and we've
+ * created the log intent. Fill in the attri log item and log format
+ * structure with fields from this xfs_attr_intent
+ */
+ attrp = &attrip->attri_format;
+ attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
+ attrp->alfi_op_flags = attr->xattri_op_flags;
+ attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
+ attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
+ ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+ attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+}
+
+/* Get an ATTRI. */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attr_intent *attr;
+ struct xfs_da_args *args;
+
+ ASSERT(count == 1);
+
+ /*
+ * Each attr item only performs one attribute operation at a time, so
+ * this is a list of one
+ */
+ attr = list_first_entry_or_null(items, struct xfs_attr_intent,
+ xattri_list);
+ args = attr->xattri_da_args;
+
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ return NULL;
+
+ /*
+ * Create a buffer to store the attribute name and value. This buffer
+ * will be shared between the higher level deferred xattr work state
+ * and the lower level xattr log items.
+ */
+ if (!attr->xattri_nameval) {
+ /*
+ * Transfer our reference to the name/value buffer to the
+ * deferred work state structure.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
+ args->namelen, args->value, args->valuelen);
+ }
+
+ attrip = xfs_attri_init(mp, attr->xattri_nameval);
+ xfs_trans_add_item(tp, &attrip->attri_item);
+ xfs_attr_log_item(tp, attrip, attr);
+
+ return &attrip->attri_item;
+}
+
+static inline void
+xfs_attr_free_item(
+ struct xfs_attr_intent *attr)
+{
+ if (attr->xattri_da_state)
+ xfs_da_state_free(attr->xattri_da_state);
+ xfs_attri_log_nameval_put(attr->xattri_nameval);
+ if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
+ kmem_free(attr);
+ else
+ kmem_cache_free(xfs_attr_intent_cache, attr);
+}
+
+/* Process an attr. */
+STATIC int
+xfs_attr_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_attr_intent *attr;
+ struct xfs_attrd_log_item *done_item = NULL;
+ int error;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ if (done)
+ done_item = ATTRD_ITEM(done);
+
+ /*
+ * Always reset trans after EAGAIN cycle
+ * since the transaction is new
+ */
+ attr->xattri_da_args->trans = tp;
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error != -EAGAIN)
+ xfs_attr_free_item(attr);
+
+ return error;
+}
+
+/* Abort all pending ATTRs. */
+STATIC void
+xfs_attr_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_attri_release(ATTRI_ITEM(intent));
+}
+
+/* Cancel an attr */
+STATIC void
+xfs_attr_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_attr_intent *attr;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ xfs_attr_free_item(attr);
+}
+
+STATIC bool
+xfs_attri_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
+}
+
+/* Is this recovered ATTRI format ok? */
+static inline bool
+xfs_attri_validate(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attrp)
+{
+ unsigned int op = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ if (attrp->__pad != 0)
+ return false;
+
+ if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
+ return false;
+
+ if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
+ return false;
+
+ /* alfi_op_flags should be either a set or remove */
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ break;
+ default:
+ return false;
+ }
+
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+
+ if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_name_len == 0))
+ return false;
+
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * delete the attr that it describes.
+ */
+STATIC int
+xfs_attri_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res tres;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error;
+ int total;
+ int local;
+ struct xfs_attrd_log_item *done_item = NULL;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ if (error)
+ return error;
+
+ attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), KM_NOFS);
+ args = (struct xfs_da_args *)(attr + 1);
+
+ attr->xattri_da_args = args;
+ attr->xattri_op_flags = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ /*
+ * We're reconstructing the deferred work state structure from the
+ * recovered log item. Grab a reference to the name/value buffer and
+ * attach it to the new work state.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attr->xattri_nameval);
+
+ args->dp = ip;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->name = nv->name.i_addr;
+ args->namelen = nv->name.i_len;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
+ XFS_DA_OP_LOGGED;
+
+ ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
+
+ switch (attr->xattri_op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ args->value = nv->value.i_addr;
+ args->valuelen = nv->value.i_len;
+ args->total = xfs_attr_calc_size(args, &local);
+ if (xfs_inode_hasattr(args->dp))
+ attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+ else
+ attr->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (!xfs_inode_hasattr(args->dp))
+ goto out;
+ attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ goto out;
+
+ args->trans = tp;
+ done_item = xfs_trans_get_attrd(tp, attrip);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error == -EAGAIN) {
+ /*
+ * There's more work to do, so add the intent item to this
+ * transaction so that we can continue it later.
+ */
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+out:
+ xfs_attr_free_item(attr);
+ return error;
+}
+
+/* Re-log an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_attrd_log_item *attrdp;
+ struct xfs_attri_log_item *old_attrip;
+ struct xfs_attri_log_item *new_attrip;
+ struct xfs_attri_log_format *new_attrp;
+ struct xfs_attri_log_format *old_attrp;
+
+ old_attrip = ATTRI_ITEM(intent);
+ old_attrp = &old_attrip->attri_format;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ attrdp = xfs_trans_get_attrd(tp, old_attrip);
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ /*
+ * Create a new log item that shares the same name/value buffer as the
+ * old log item.
+ */
+ new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval);
+ new_attrp = &new_attrip->attri_format;
+
+ new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+ new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
+
+ xfs_trans_add_item(tp, &new_attrip->attri_item);
+ set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+ return &new_attrip->attri_item;
+}
+
+STATIC int
+xlog_recover_attri_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attri_log_format *attri_formatp;
+ struct xfs_attri_log_nameval *nv;
+ const void *attr_value = NULL;
+ const void *attr_name;
+ size_t len;
+
+ attri_formatp = item->ri_buf[0].i_addr;
+ attr_name = item->ri_buf[1].i_addr;
+
+ /* Validate xfs_attri_log_format before the large memory allocation */
+ len = sizeof(struct xfs_attri_log_format);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_attri_validate(mp, attri_formatp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the attr name */
+ if (item->ri_buf[1].i_len !=
+ xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ /* Validate the attr value, if present */
+ if (attri_formatp->alfi_value_len != 0) {
+ if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr,
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ attr_value = item->ri_buf[2].i_addr;
+ }
+
+ /*
+ * Memory alloc failure will cause replay to abort. We attach the
+ * name/value buffer to the recovered incore log item and drop our
+ * reference.
+ */
+ nv = xfs_attri_log_nameval_alloc(attr_name,
+ attri_formatp->alfi_name_len, attr_value,
+ attri_formatp->alfi_value_len);
+
+ attrip = xfs_attri_init(mp, nv);
+ memcpy(&attrip->attri_format, attri_formatp, len);
+
+ /*
+ * The ATTRI has two references. One for the ATTRD and one for ATTRI to
+ * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+ * directly and drop the ATTRI reference. Note that
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+ xfs_attri_release(attrip);
+ xfs_attri_log_nameval_put(nv);
+ return 0;
+}
+
+/*
+ * This routine is called to allocate an "attr free done" log item.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip)
+{
+ struct xfs_attrd_log_item *attrdp;
+
+ ASSERT(tp != NULL);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ xfs_trans_add_item(tp, &attrdp->attrd_item);
+ return attrdp;
+}
+
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (!intent)
+ return NULL;
+
+ return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it was still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attrd_log_format *attrd_formatp;
+
+ attrd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_ATTRI,
+ attrd_formatp->alfd_alf_id);
+ return 0;
+}
+
+static const struct xfs_item_ops xfs_attri_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_attri_item_size,
+ .iop_format = xfs_attri_item_format,
+ .iop_unpin = xfs_attri_item_unpin,
+ .iop_release = xfs_attri_item_release,
+ .iop_recover = xfs_attri_item_recover,
+ .iop_match = xfs_attri_item_match,
+ .iop_relog = xfs_attri_item_relog,
+};
+
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+ .item_type = XFS_LI_ATTRI,
+ .commit_pass2 = xlog_recover_attri_commit_pass2,
+};
+
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_attrd_item_size,
+ .iop_format = xfs_attrd_item_format,
+ .iop_release = xfs_attrd_item_release,
+ .iop_intent = xfs_attrd_item_intent,
+};
+
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+ .item_type = XFS_LI_ATTRD,
+ .commit_pass2 = xlog_recover_attrd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
new file mode 100644
index 000000000000..3280a7930287
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef __XFS_ATTR_ITEM_H__
+#define __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+struct xfs_attri_log_nameval {
+ struct xfs_log_iovec name;
+ struct xfs_log_iovec value;
+ refcount_t refcount;
+
+ /* name and value follow the end of this struct */
+};
+
+/*
+ * This is the "attr intention" log item. It is used to log the fact that some
+ * extended attribute operations need to be processed. An operation is
+ * currently either a set or remove. Set or remove operations are described by
+ * the xfs_attr_intent which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure. During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_intent until they are committed. They are freed
+ * when the xfs_attr_intent itself is freed when the work is done.
+ */
+struct xfs_attri_log_item {
+ struct xfs_log_item attri_item;
+ atomic_t attri_refcount;
+ struct xfs_attri_log_nameval *attri_nameval;
+ struct xfs_attri_log_format attri_format;
+};
+
+/*
+ * This is the "attr done" log item. It is used to log the fact that some attrs
+ * earlier mentioned in an attri item have been freed.
+ */
+struct xfs_attrd_log_item {
+ struct xfs_log_item attrd_item;
+ struct xfs_attri_log_item *attrd_attrip;
+ struct xfs_attrd_log_format attrd_format;
+};
+
+extern struct kmem_cache *xfs_attri_cache;
+extern struct kmem_cache *xfs_attrd_cache;
+
+#endif /* __XFS_ATTR_ITEM_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d37743bdf274..99bbbe1a0e44 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
#include "xfs_attr_leaf.h"
@@ -44,7 +45,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
/*
* Copy out entries of shortform attribute lists for attr_list().
* Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
+ * If the output buffer is not large enough to hold them all, then
* we have to calculate each entries' hashvalue and sort them before
* we can begin returning them to the user.
*/
@@ -52,24 +53,18 @@ static int
xfs_attr_shortform_list(
struct xfs_attr_list_context *context)
{
- struct attrlist_cursor_kern *cursor;
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
+ struct xfs_inode *dp = context->dp;
struct xfs_attr_sf_sort *sbuf, *sbp;
struct xfs_attr_shortform *sf;
struct xfs_attr_sf_entry *sfe;
- struct xfs_inode *dp;
int sbsize, nsbuf, count, i;
int error = 0;
- ASSERT(context != NULL);
- dp = context->dp;
- ASSERT(dp != NULL);
- ASSERT(dp->i_afp != NULL);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
- cursor = context->cursor;
- ASSERT(cursor != NULL);
trace_xfs_attr_list_sf(context);
@@ -84,7 +79,7 @@ xfs_attr_shortform_list(
*/
if (context->bufsize == 0 ||
(XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
@@ -101,7 +96,7 @@ xfs_attr_shortform_list(
*/
if (context->seen_enough)
break;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
trace_xfs_attr_list_sf_all(context);
return 0;
@@ -125,7 +120,7 @@ xfs_attr_shortform_list(
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
@@ -141,7 +136,7 @@ xfs_attr_shortform_list(
/* These are bytes, and both on-disk, don't endian-flip */
sbp->valuelen = sfe->valuelen;
sbp->flags = sfe->flags;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
sbp++;
nsbuf++;
}
@@ -205,7 +200,7 @@ out:
STATIC int
xfs_attr_node_list_lookup(
struct xfs_attr_list_context *context,
- struct attrlist_cursor_kern *cursor,
+ struct xfs_attrlist_cursor_kern *cursor,
struct xfs_buf **pbp)
{
struct xfs_da3_icnode_hdr nodehdr;
@@ -279,7 +274,7 @@ xfs_attr_node_list_lookup(
return 0;
out_corruptbuf:
- xfs_buf_corruption_error(bp);
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
return -EFSCORRUPTED;
}
@@ -288,8 +283,8 @@ STATIC int
xfs_attr_node_list(
struct xfs_attr_list_context *context)
{
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_attr3_icleaf_hdr leafhdr;
- struct attrlist_cursor_kern *cursor;
struct xfs_attr_leafblock *leaf;
struct xfs_da_intnode *node;
struct xfs_buf *bp;
@@ -299,7 +294,6 @@ xfs_attr_node_list(
trace_xfs_attr_node_list(context);
- cursor = context->cursor;
cursor->initted = 1;
/*
@@ -394,7 +388,7 @@ xfs_attr3_leaf_list_int(
struct xfs_buf *bp,
struct xfs_attr_list_context *context)
{
- struct attrlist_cursor_kern *cursor;
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
@@ -408,7 +402,6 @@ xfs_attr3_leaf_list_int(
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- cursor = context->cursor;
cursor->initted = 1;
/*
@@ -452,8 +445,8 @@ xfs_attr3_leaf_list_int(
}
if ((entry->flags & XFS_ATTR_INCOMPLETE) &&
- !(context->flags & ATTR_INCOMPLETE))
- continue; /* skip incomplete entries */
+ !context->allow_incomplete)
+ continue;
if (entry->flags & XFS_ATTR_LOCAL) {
xfs_attr_leaf_name_local_t *name_loc;
@@ -488,14 +481,15 @@ xfs_attr3_leaf_list_int(
* Copy out attribute entries for attr_list(), for leaf attribute lists.
*/
STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+xfs_attr_leaf_list(
+ struct xfs_attr_list_context *context)
{
- int error;
- struct xfs_buf *bp;
+ struct xfs_buf *bp;
+ int error;
trace_xfs_attr_leaf_list(context);
- context->cursor->blkno = 0;
+ context->cursor.blkno = 0;
error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
if (error)
return error;
@@ -506,7 +500,7 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
}
int
-xfs_attr_list_int_ilocked(
+xfs_attr_list_ilocked(
struct xfs_attr_list_context *context)
{
struct xfs_inode *dp = context->dp;
@@ -518,151 +512,28 @@ xfs_attr_list_int_ilocked(
*/
if (!xfs_inode_hasattr(dp))
return 0;
- else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_list(context);
- else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ if (xfs_attr_is_leaf(dp))
return xfs_attr_leaf_list(context);
return xfs_attr_node_list(context);
}
int
-xfs_attr_list_int(
- xfs_attr_list_context_t *context)
+xfs_attr_list(
+ struct xfs_attr_list_context *context)
{
- int error;
- xfs_inode_t *dp = context->dp;
- uint lock_mode;
+ struct xfs_inode *dp = context->dp;
+ uint lock_mode;
+ int error;
XFS_STATS_INC(dp->i_mount, xs_attr_list);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
lock_mode = xfs_ilock_attr_map_shared(dp);
- error = xfs_attr_list_int_ilocked(context);
+ error = xfs_attr_list_ilocked(context);
xfs_iunlock(dp, lock_mode);
return error;
}
-
-#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
- (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \
- & ~(sizeof(uint32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-STATIC void
-xfs_attr_put_listent(
- xfs_attr_list_context_t *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
-{
- struct attrlist *alist = (struct attrlist *)context->alist;
- attrlist_ent_t *aep;
- int arraytop;
-
- ASSERT(!context->seen_enough);
- ASSERT(!(context->flags & ATTR_KERNOVAL));
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*alist));
- ASSERT(context->firstu <= context->bufsize);
-
- /*
- * Only list entries in the right namespace.
- */
- if (((context->flags & ATTR_SECURE) == 0) !=
- ((flags & XFS_ATTR_SECURE) == 0))
- return;
- if (((context->flags & ATTR_ROOT) == 0) !=
- ((flags & XFS_ATTR_ROOT) == 0))
- return;
-
- arraytop = sizeof(*alist) +
- context->count * sizeof(alist->al_offset[0]);
- context->firstu -= ATTR_ENTSIZE(namelen);
- if (context->firstu < arraytop) {
- trace_xfs_attr_list_full(context);
- alist->al_more = 1;
- context->seen_enough = 1;
- return;
- }
-
- aep = (attrlist_ent_t *)&context->alist[context->firstu];
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[namelen] = 0;
- alist->al_offset[context->count++] = context->firstu;
- alist->al_count = context->count;
- trace_xfs_attr_list_add(context);
- return;
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths. Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(
- xfs_inode_t *dp,
- char *buffer,
- int bufsize,
- int flags,
- attrlist_cursor_kern_t *cursor)
-{
- xfs_attr_list_context_t context;
- struct attrlist *alist;
- int error;
-
- /*
- * Validate the cursor.
- */
- if (cursor->pad1 || cursor->pad2)
- return -EINVAL;
- if ((cursor->initted == 0) &&
- (cursor->hashval || cursor->blkno || cursor->offset))
- return -EINVAL;
-
- /* Only internal consumers can retrieve incomplete attrs. */
- if (flags & ATTR_INCOMPLETE)
- return -EINVAL;
-
- /*
- * Check for a properly aligned buffer.
- */
- if (((long)buffer) & (sizeof(int)-1))
- return -EFAULT;
- if (flags & ATTR_KERNOVAL)
- bufsize = 0;
-
- /*
- * Initialize the output buffer.
- */
- memset(&context, 0, sizeof(context));
- context.dp = dp;
- context.cursor = cursor;
- context.resynch = 1;
- context.flags = flags;
- context.alist = buffer;
- context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
- context.firstu = context.bufsize;
- context.put_listent = xfs_attr_put_listent;
-
- alist = (struct attrlist *)context.alist;
- alist->al_count = 0;
- alist->al_more = 0;
- alist->al_offset[0] = context.bufsize;
-
- error = xfs_attr_list_int(&context);
- ASSERT(error <= 0);
- return error;
-}
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index e2148f2d5d6b..fe21c76f75b8 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -6,7 +6,7 @@
static inline unsigned int bio_max_vecs(unsigned int count)
{
- return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+ return bio_max_segs(howmany(count, PAGE_SIZE));
}
int
@@ -15,7 +15,7 @@ xfs_rw_bdev(
sector_t sector,
unsigned int count,
char *data,
- unsigned int op)
+ enum req_op op)
{
unsigned int is_vmalloc = is_vmalloc_addr(data);
@@ -26,10 +26,9 @@ xfs_rw_bdev(
if (is_vmalloc && op == REQ_OP_WRITE)
flush_kernel_vmap_range(data, count);
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = op | REQ_META | REQ_SYNC;
do {
struct page *page = kmem_to_page(data);
@@ -39,10 +38,9 @@ xfs_rw_bdev(
while (bio_add_page(bio, page, len, off) != len) {
struct bio *prev = bio;
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_copy_dev(bio, prev);
+ bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
- bio->bi_opf = prev->bi_opf;
bio_chain(prev, bio);
submit_bio(prev);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index ee6f4229cebc..41323da523d1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -22,20 +22,25 @@
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
-kmem_zone_t *xfs_bui_zone;
-kmem_zone_t *xfs_bud_zone;
+struct kmem_cache *xfs_bui_cache;
+struct kmem_cache *xfs_bud_cache;
+
+static const struct xfs_item_ops xfs_bui_item_ops;
static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_bui_log_item, bui_item);
}
-void
+STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_cache_free(xfs_bui_zone, buip);
+ kmem_free(buip->bui_item.li_lv_shadow);
+ kmem_cache_free(xfs_bui_cache, buip);
}
/*
@@ -45,15 +50,16 @@ xfs_bui_item_free(
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the BUI.
*/
-void
+STATIC void
xfs_bui_release(
struct xfs_bui_log_item *buip)
{
ASSERT(atomic_read(&buip->bui_refcount) > 0);
- if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_bui_item_free(buip);
- }
+ if (!atomic_dec_and_test(&buip->bui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&buip->bui_item, 0);
+ xfs_bui_item_free(buip);
}
@@ -124,24 +130,17 @@ xfs_bui_item_release(
xfs_bui_release(BUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_bui_item_ops = {
- .iop_size = xfs_bui_item_size,
- .iop_format = xfs_bui_item_format,
- .iop_unpin = xfs_bui_item_unpin,
- .iop_release = xfs_bui_item_release,
-};
-
/*
* Allocate and initialize an bui item with the given number of extents.
*/
-struct xfs_bui_log_item *
+STATIC struct xfs_bui_log_item *
xfs_bui_init(
struct xfs_mount *mp)
{
struct xfs_bui_log_item *buip;
- buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+ buip = kmem_cache_zalloc(xfs_bui_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -201,14 +200,24 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_cache_free(xfs_bud_zone, budp);
+ kmem_free(budp->bud_item.li_lv_shadow);
+ kmem_cache_free(xfs_bud_cache, budp);
+}
+
+static struct xfs_log_item *
+xfs_bud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &BUD_ITEM(lip)->bud_buip->bui_item;
}
static const struct xfs_item_ops xfs_bud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_bud_item_size,
.iop_format = xfs_bud_item_format,
.iop_release = xfs_bud_item_release,
+ .iop_intent = xfs_bud_item_intent,
};
static struct xfs_bud_log_item *
@@ -218,7 +227,7 @@ xfs_trans_get_bud(
{
struct xfs_bud_log_item *budp;
- budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
&xfs_bud_item_ops);
budp->bud_buip = buip;
@@ -257,7 +266,7 @@ xfs_trans_log_finish_bmap_update(
* 1.) releases the BUI and frees the BUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
@@ -267,8 +276,8 @@ xfs_trans_log_finish_bmap_update(
static int
xfs_bmap_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_bmap_intent *ba;
struct xfs_bmap_intent *bb;
@@ -278,27 +287,6 @@ xfs_bmap_update_diff_items(
return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_bui_log_item *buip;
-
- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- ASSERT(tp != NULL);
-
- buip = xfs_bui_init(tp->t_mountp);
- ASSERT(buip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &buip->bui_item);
- return buip;
-}
-
/* Set the map extent flags for this mapping. */
static void
xfs_trans_set_bmap_flags(
@@ -326,16 +314,12 @@ xfs_trans_set_bmap_flags(
STATIC void
xfs_bmap_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_bui_log_item *buip,
+ struct xfs_bmap_intent *bmap)
{
- struct xfs_bui_log_item *buip = intent;
- struct xfs_bmap_intent *bmap;
uint next_extent;
struct xfs_map_extent *map;
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
@@ -355,23 +339,44 @@ xfs_bmap_update_log_item(
bmap->bi_bmap.br_state);
}
+static struct xfs_log_item *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_bui_log_item *buip = xfs_bui_init(mp);
+ struct xfs_bmap_intent *bmap;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+
+ xfs_trans_add_item(tp, &buip->bui_item);
+ if (sort)
+ list_sort(mp, items, xfs_bmap_update_diff_items);
+ list_for_each_entry(bmap, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bmap);
+ return &buip->bui_item;
+}
+
/* Get an BUD so we can process all the deferred rmap updates. */
-STATIC void *
+static struct xfs_log_item *
xfs_bmap_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_bud(tp, intent);
+ return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
}
/* Process a deferred rmap update. */
STATIC int
xfs_bmap_update_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_bmap_intent *bmap;
xfs_filblks_t count;
@@ -379,7 +384,7 @@ xfs_bmap_update_finish_item(
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
count = bmap->bi_bmap.br_blockcount;
- error = xfs_trans_log_finish_bmap_update(tp, done_item,
+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done),
bmap->bi_type,
bmap->bi_owner, bmap->bi_whichfork,
bmap->bi_bmap.br_startoff,
@@ -391,16 +396,16 @@ xfs_bmap_update_finish_item(
bmap->bi_bmap.br_blockcount = count;
return -EAGAIN;
}
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
return error;
}
/* Abort all pending BUIs. */
STATIC void
xfs_bmap_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_bui_release(intent);
+ xfs_bui_release(BUI_ITEM(intent));
}
/* Cancel a deferred rmap update. */
@@ -411,134 +416,125 @@ xfs_bmap_update_cancel_item(
struct xfs_bmap_intent *bmap;
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_free(bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bmap);
}
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_bmap_update_diff_items,
.create_intent = xfs_bmap_update_create_intent,
.abort_intent = xfs_bmap_update_abort_intent,
- .log_item = xfs_bmap_update_log_item,
.create_done = xfs_bmap_update_create_done,
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
};
-/*
- * Process a bmap update intent item that was recovered from the log.
- * We need to update some inode's bmbt.
- */
-int
-xfs_bui_recover(
- struct xfs_trans *parent_tp,
+/* Is this recovered BUI ok? */
+static inline bool
+xfs_bui_validate(
+ struct xfs_mount *mp,
struct xfs_bui_log_item *buip)
{
- int error = 0;
- unsigned int bui_type;
struct xfs_map_extent *bmap;
- xfs_fsblock_t startblock_fsb;
- xfs_fsblock_t inode_fsb;
- xfs_filblks_t count;
- bool op_ok;
- struct xfs_bud_log_item *budp;
- enum xfs_bmap_intent_type type;
- int whichfork;
- xfs_exntst_t state;
- struct xfs_trans *tp;
- struct xfs_inode *ip = NULL;
- struct xfs_bmbt_irec irec;
- struct xfs_mount *mp = parent_tp->t_mountp;
-
- ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
/* Only one mapping operation per BUI... */
- if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_bui_release(buip);
- return -EFSCORRUPTED;
- }
+ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ return false;
- /*
- * First check the validity of the extent described by the
- * BUI. If anything is bad, then toss the BUI.
- */
bmap = &buip->bui_format.bui_extents[0];
- startblock_fsb = XFS_BB_TO_FSB(mp,
- XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
- inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
- XFS_INO_TO_FSB(mp, bmap->me_owner)));
+
+ if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)
+ return false;
+
switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
- op_ok = true;
break;
default:
- op_ok = false;
- break;
- }
- if (!op_ok || startblock_fsb == 0 ||
- bmap->me_len == 0 ||
- inode_fsb == 0 ||
- startblock_fsb >= mp->m_sb.sb_dblocks ||
- bmap->me_len >= mp->m_sb.sb_agblocks ||
- inode_fsb >= mp->m_sb.sb_dblocks ||
- (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the BUI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_bui_release(buip);
- return -EFSCORRUPTED;
+ return false;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
- if (error)
- return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
- budp = xfs_trans_get_bud(tp, buip);
+ if (!xfs_verify_ino(mp, bmap->me_owner))
+ return false;
- /* Grab the inode. */
- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
- if (error)
- goto err_inode;
+ if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len))
+ return false;
+
+ return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len);
+}
+
+/*
+ * Process a bmap update intent item that was recovered from the log.
+ * We need to update some inode's bmbt.
+ */
+STATIC int
+xfs_bui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+ struct xfs_trans *tp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_map_extent *bmap;
+ struct xfs_bud_log_item *budp;
+ xfs_filblks_t count;
+ xfs_exntst_t state;
+ unsigned int bui_type;
+ int whichfork;
+ int iext_delta;
+ int error = 0;
- if (VFS_I(ip)->i_nlink == 0)
- xfs_iflags_set(ip, XFS_IRECOVERY);
+ if (!xfs_bui_validate(mp, buip)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &buip->bui_format, sizeof(buip->bui_format));
+ return -EFSCORRUPTED;
+ }
- /* Process deferred bmap item. */
+ bmap = &buip->bui_format.bui_extents[0];
state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- switch (bui_type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- type = bui_type;
- break;
- default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto err_inode;
- }
+
+ error = xlog_recover_iget(mp, bmap->me_owner, &ip);
+ if (error)
+ return error;
+
+ /* Allocate transaction and do the work. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ budp = xfs_trans_get_bud(tp, buip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ if (bui_type == XFS_BMAP_MAP)
+ iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
+ else
+ iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
+
+ error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ if (error)
+ goto err_cancel;
+
count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork,
- bmap->me_startoff, bmap->me_startblock, &count, state);
+ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap,
+ sizeof(*bmap));
if (error)
- goto err_inode;
+ goto err_cancel;
if (count > 0) {
- ASSERT(type == XFS_BMAP_UNMAP);
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
irec.br_startblock = bmap->me_startblock;
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
@@ -546,20 +542,171 @@ xfs_bui_recover(
xfs_bmap_unmap_extent(tp, ip, &irec);
}
- set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
+ /*
+ * Commit transaction, which frees the transaction and saves the inode
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto err_unlock;
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
+ return 0;
+err_cancel:
+ xfs_trans_cancel(tp);
+err_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+err_rele:
+ xfs_irele(ip);
return error;
+}
-err_inode:
- xfs_defer_move(parent_tp, tp);
- xfs_trans_cancel(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
+STATIC bool
+xfs_bui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_bui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_bud_log_item *budp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = BUI_ITEM(intent)->bui_format.bui_nextents;
+ extp = BUI_ITEM(intent)->bui_format.bui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
+ atomic_set(&buip->bui_next_extent, count);
+ xfs_trans_add_item(tp, &buip->bui_item);
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+ return &buip->bui_item;
+}
+
+static const struct xfs_item_ops xfs_bui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_bui_item_size,
+ .iop_format = xfs_bui_item_format,
+ .iop_unpin = xfs_bui_item_unpin,
+ .iop_release = xfs_bui_item_release,
+ .iop_recover = xfs_bui_item_recover,
+ .iop_match = xfs_bui_item_match,
+ .iop_relog = xfs_bui_item_relog,
+};
+
+static inline void
+xfs_bui_copy_format(
+ struct xfs_bui_log_format *dst,
+ const struct xfs_bui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_bui_log_format, bui_extents));
+
+ for (i = 0; i < src->bui_nextents; i++)
+ memcpy(&dst->bui_extents[i], &src->bui_extents[i],
+ sizeof(struct xfs_map_extent));
+}
+
+/*
+ * This routine is called to create an in-core extent bmap update
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_bui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_bui_log_format *bui_formatp;
+ size_t len;
+
+ bui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
}
- return error;
+
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ buip = xfs_bui_init(mp);
+ xfs_bui_copy_format(&buip->bui_format, bui_formatp);
+ atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn);
+ xfs_bui_release(buip);
+ return 0;
}
+
+const struct xlog_recover_item_ops xlog_bui_item_ops = {
+ .item_type = XFS_LI_BUI,
+ .commit_pass2 = xlog_recover_bui_commit_pass2,
+};
+
+/*
+ * This routine is called when an BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_bud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_bud_log_format *bud_formatp;
+
+ bud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_BUI, bud_formatp->bud_bui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_bud_item_ops = {
+ .item_type = XFS_LI_BUD,
+ .commit_pass2 = xlog_recover_bud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index ad479cc73de8..3fafd3881a0b 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -25,7 +25,7 @@
/* kernel only BUI/BUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -33,11 +33,6 @@ struct kmem_zone;
#define XFS_BUI_MAX_FAST_EXTENTS 1
/*
- * Define BUI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_BUI_RECOVERED 1
-
-/*
* This is the "bmap update intent" log item. It is used to log the fact that
* some reverse mappings need to change. It is used in conjunction with the
* "bmap update done" log item described below.
@@ -49,7 +44,6 @@ struct xfs_bui_log_item {
struct xfs_log_item bui_item;
atomic_t bui_refcount;
atomic_t bui_next_extent;
- unsigned long bui_flags; /* misc flags */
struct xfs_bui_log_format bui_format;
};
@@ -71,12 +65,7 @@ struct xfs_bud_log_item {
struct xfs_bud_log_format bud_format;
};
-extern struct kmem_zone *xfs_bui_zone;
-extern struct kmem_zone *xfs_bud_zone;
-
-struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
-void xfs_bui_item_free(struct xfs_bui_log_item *);
-void xfs_bui_release(struct xfs_bui_log_item *);
-int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
+extern struct kmem_cache *xfs_bui_cache;
+extern struct kmem_cache *xfs_bud_cache;
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e62fb5216341..04d0c2bff67c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -71,18 +71,24 @@ xfs_zero_extent(
#ifdef CONFIG_XFS_RT
int
xfs_bmap_rtalloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+ struct xfs_bmalloca *ap)
{
- int error; /* error return value */
- xfs_mount_t *mp; /* mount point structure */
- xfs_extlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_rtblock_t rtb;
-
- mp = ap->ip->i_mount;
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtblock_t rtb;
+ xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t mod = 0; /* product factor for allocators */
+ xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t orig_length = ap->length;
+ xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
+ xfs_extlen_t raminlen;
+ bool rtlocked = false;
+ bool ignore_locality = false;
+ int error;
+
align = xfs_get_extsz_hint(ap->ip);
+retry:
prod = align / mp->m_sb.sb_rextsize;
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
@@ -93,6 +99,15 @@ xfs_bmap_rtalloc(
ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
/*
+ * If we shifted the file offset downward to satisfy an extent size
+ * hint, increase minlen by that amount so that the allocator won't
+ * give us an allocation that's too short to cover at least one of the
+ * blocks that the caller asked for.
+ */
+ if (ap->offset != orig_offset)
+ minlen += orig_offset - ap->offset;
+
+ /*
* If the offset & length are not perfectly aligned
* then kill prod, it will just get us in trouble.
*/
@@ -104,29 +119,32 @@ xfs_bmap_rtalloc(
*/
ralen = ap->length / mp->m_sb.sb_rextsize;
/*
- * If the old value was close enough to MAXEXTLEN that
+ * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
+ * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+ if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+ ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
/*
* Lock out modifications to both the RT bitmap and summary inodes
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ if (!rtlocked) {
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ rtlocked = true;
+ }
/*
* If it's an allocation to an empty file at offset 0,
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+ xfs_rtblock_t rtx; /* realtime extent no */
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
@@ -141,33 +159,59 @@ xfs_bmap_rtalloc(
/*
* Realtime allocation, done through xfs_rtallocate_extent.
*/
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
+ if (ignore_locality)
+ ap->blkno = 0;
+ else
+ do_div(ap->blkno, mp->m_sb.sb_rextsize);
rtb = ap->blkno;
ap->length = ralen;
- error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
- &ralen, ap->wasdel, prod, &rtb);
+ raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
+ error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
+ &ralen, ap->wasdel, prod, &rtb);
if (error)
return error;
- ap->blkno = rtb;
- if (ap->blkno != NULLFSBLOCK) {
- ap->blkno *= mp->m_sb.sb_rextsize;
- ralen *= mp->m_sb.sb_rextsize;
- ap->length = ralen;
- ap->ip->i_d.di_nblocks += ralen;
+ if (rtb != NULLRTBLOCK) {
+ ap->blkno = rtb * mp->m_sb.sb_rextsize;
+ ap->length = ralen * mp->m_sb.sb_rextsize;
+ ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
- ap->ip->i_delayed_blks -= ralen;
+ ap->ip->i_delayed_blks -= ap->length;
/*
* Adjust the disk quota also. This was reserved
* earlier.
*/
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
- } else {
- ap->length = 0;
+ XFS_TRANS_DQ_RTBCOUNT, ap->length);
+ return 0;
}
+
+ if (align > mp->m_sb.sb_rextsize) {
+ /*
+ * We previously enlarged the request length to try to satisfy
+ * an extent size hint. The allocator didn't return anything,
+ * so reset the parameters to the original values and try again
+ * without alignment criteria.
+ */
+ ap->offset = orig_offset;
+ ap->length = orig_length;
+ minlen = align = mp->m_sb.sb_rextsize;
+ goto retry;
+ }
+
+ if (!ignore_locality && ap->blkno != 0) {
+ /*
+ * If we can't allocate near a specific rt extent, try again
+ * without locality criteria.
+ */
+ ignore_locality = true;
+ goto retry;
+ }
+
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
return 0;
}
#endif /* CONFIG_XFS_RT */
@@ -212,7 +256,7 @@ xfs_bmap_count_blocks(
xfs_filblks_t *count)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
xfs_extlen_t btblocks = 0;
int error;
@@ -223,13 +267,11 @@ xfs_bmap_count_blocks(
if (!ifp)
return 0;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_BTREE:
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, whichfork);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
error = xfs_btree_count_blocks(cur, &btblocks);
@@ -244,7 +286,7 @@ xfs_bmap_count_blocks(
*/
*count += btblocks - 1;
- /* fall through */
+ fallthrough;
case XFS_DINODE_FMT_EXTENTS:
*nextents = xfs_bmap_count_leaves(ifp, count);
break;
@@ -397,33 +439,32 @@ xfs_getbmap(
whichfork = XFS_COW_FORK;
else
whichfork = XFS_DATA_FORK;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
switch (whichfork) {
case XFS_ATTR_FORK:
- if (!XFS_IFORK_Q(ip))
- goto out_unlock_iolock;
+ lock = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_has_attr_fork(ip))
+ goto out_unlock_ilock;
max_len = 1LL << 32;
- lock = xfs_ilock_attr_map_shared(ip);
break;
case XFS_COW_FORK:
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
+
/* No CoW fork? Just return */
- if (!ifp)
- goto out_unlock_iolock;
+ if (!xfs_ifork_ptr(ip, whichfork))
+ goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip))
max_len = mp->m_super->s_maxbytes;
else
max_len = XFS_ISIZE(ip);
-
- lock = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lock);
break;
case XFS_DATA_FORK:
if (!(iflags & BMV_IF_DELALLOC) &&
- (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
+ (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (error)
goto out_unlock_iolock;
@@ -439,7 +480,7 @@ xfs_getbmap(
}
if (xfs_get_extsz_hint(ip) ||
- (ip->i_d.di_flags &
+ (ip->i_diflags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
max_len = mp->m_super->s_maxbytes;
else
@@ -449,7 +490,9 @@ xfs_getbmap(
break;
}
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ ifp = xfs_ifork_ptr(ip, whichfork);
+
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
break;
@@ -471,11 +514,9 @@ xfs_getbmap(
first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, whichfork);
- if (error)
- goto out_unlock_ilock;
- }
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ goto out_unlock_ilock;
if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
/*
@@ -558,7 +599,7 @@ xfs_bmap_punch_delalloc_range(
struct xfs_iext_cursor icur;
int error = 0;
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(!xfs_need_iread_extents(ifp));
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
@@ -597,8 +638,24 @@ out_unlock:
* regular files that are marked preallocated or append-only.
*/
bool
-xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+xfs_can_free_eofblocks(
+ struct xfs_inode *ip,
+ bool force)
{
+ struct xfs_bmbt_irec imap;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t end_fsb;
+ xfs_fileoff_t last_fsb;
+ int nimaps = 1;
+ int error;
+
+ /*
+ * Caller must either hold the exclusive io lock; or be inactivating
+ * the inode, which guarantees there are no other users of the inode.
+ */
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
+ (VFS_I(ip)->i_state & I_FREEING));
+
/* prealloc/delalloc exists only on regular files */
if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
@@ -613,18 +670,45 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
return false;
/* If we haven't read in the extent list, then don't do it now. */
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ if (xfs_need_iread_extents(&ip->i_df))
return false;
/*
* Do not free real preallocated or append-only files unless the file
* has delalloc blocks and we are forced to remove them.
*/
- if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
if (!force || ip->i_delayed_blks == 0)
return false;
- return true;
+ /*
+ * Do not try to free post-EOF blocks if EOF is beyond the end of the
+ * range supported by the page cache, because the truncation will loop
+ * forever.
+ */
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+ if (last_fsb <= end_fsb)
+ return false;
+
+ /*
+ * Look up the mapping for the first block past EOF. If we can't find
+ * it, there's nothing to free.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
+ 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error || nimaps == 0)
+ return false;
+
+ /*
+ * If there's a real mapping there or there are delayed allocation
+ * reservations, then we have post-EOF blocks to try to free.
+ */
+ return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks;
}
/*
@@ -637,78 +721,52 @@ xfs_free_eofblocks(
struct xfs_inode *ip)
{
struct xfs_trans *tp;
- int error;
- xfs_fileoff_t end_fsb;
- xfs_fileoff_t last_fsb;
- xfs_filblks_t map_len;
- int nimaps;
- struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
+ int error;
- /*
- * Figure out if there are any blocks beyond the end
- * of the file. If not, then there is nothing to do.
- */
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
- last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- if (last_fsb <= end_fsb)
- return 0;
- map_len = last_fsb - end_fsb;
+ /* Attach the dquots to the inode up front. */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
- nimaps = 1;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ /* Wait on dio to ensure i_size has settled. */
+ inode_dio_wait(VFS_I(ip));
- /*
- * If there are blocks after the end of file, truncate the file to its
- * current size to free them up.
- */
- if (!error && (nimaps != 0) &&
- (imap.br_startblock != HOLESTARTBLOCK ||
- ip->i_delayed_blks)) {
- /*
- * Attach the dquots to the inode up front.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ if (error) {
+ ASSERT(xfs_is_shutdown(mp));
+ return error;
+ }
- /* wait on dio to ensure i_size has settled */
- inode_dio_wait(VFS_I(ip));
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
- &tp);
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- return error;
- }
+ /*
+ * Do not update the on-disk file size. If we update the on-disk file
+ * size and then the system crashes before the contents of the file are
+ * flushed to disk then the files may be full of holes (ie NULL files
+ * bug).
+ */
+ error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
+ if (error)
+ goto err_cancel;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
- /*
- * Do not update the on-disk file size. If we update the
- * on-disk file size and then the system crashes before the
- * contents of the file are flushed to disk then the files
- * may be full of holes (ie NULL files bug).
- */
- error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
- XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
- if (error) {
- /*
- * If we get an error at this point we simply don't
- * bother truncating the file.
- */
- xfs_trans_cancel(tp);
- } else {
- error = xfs_trans_commit(tp);
- if (!error)
- xfs_inode_clear_eofblocks_tag(ip);
- }
+ xfs_inode_clear_eofblocks_tag(ip);
+ goto out_unlock;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+err_cancel:
+ /*
+ * If we get an error at this point we simply don't
+ * bother truncating the file.
+ */
+ xfs_trans_cancel(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -716,8 +774,7 @@ int
xfs_alloc_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len,
- int alloc_type)
+ xfs_off_t len)
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
@@ -727,16 +784,14 @@ xfs_alloc_file_space(
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
int nimaps;
- int quota_flag;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
- uint qblocks, resblks, resrtextents;
int error;
trace_xfs_alloc_file_space(ip);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(ip);
@@ -761,6 +816,7 @@ xfs_alloc_file_space(
*/
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
+ unsigned int dblocks, rblocks, resblks;
/*
* Determine space reservations for data/realtime.
@@ -786,53 +842,41 @@ xfs_alloc_file_space(
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow. We can't allocate
* more than @nimaps extents, and an extent is limited on disk
- * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
+ * limit.
*/
- resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ resblks = min_t(xfs_fileoff_t, (e - s),
+ (XFS_MAX_BMBT_EXTLEN * nimaps));
if (unlikely(rt)) {
- resrtextents = qblocks = resblks;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resblks;
} else {
- resrtextents = 0;
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ rblocks = 0;
}
- /*
- * Allocate and setup the transaction.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
- resrtextents, 0, &tp);
-
- /*
- * Check for running out of space
- */
- if (error) {
- /*
- * Free the transaction structure.
- */
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- break;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
- 0, quota_flag);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ dblocks, rblocks, false, &tp);
if (error)
- goto error1;
+ break;
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto error;
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, alloc_type, 0, imapp,
- &nimaps);
+ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+ &nimaps);
if (error)
- goto error0;
+ goto error;
+
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- /*
- * Complete the transaction
- */
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
@@ -851,10 +895,7 @@ xfs_alloc_file_space(
return error;
-error0: /* unlock inode, unreserve quota blocks, cancel trans */
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
+error:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -872,20 +913,18 @@ xfs_unmap_extent(
uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- if (error) {
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
+ if (error)
return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
- ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
if (error)
goto out_trans_cancel;
@@ -912,7 +951,7 @@ xfs_flush_unmap_range(
xfs_off_t rounding, start, end;
int error;
- rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+ rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
start = round_down(offset, rounding);
end = round_up(offset + len, rounding) - 1;
@@ -946,6 +985,14 @@ xfs_free_file_space(
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+ /* We can only free complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ startoffset_fsb = roundup_64(startoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ endoffset_fsb = rounddown_64(endoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ }
+
/*
* Need to zero the stuff we're not freeing, on disk.
*/
@@ -960,7 +1007,7 @@ xfs_free_file_space(
/*
* Now that we've unmap all full blocks we'll have to zero out any
- * partial block at the beginning and/or end. iomap_zero_range is smart
+ * partial block at the beginning and/or end. xfs_zero_range is smart
* enough to skip any holes, including those we just created, but we
* must take care not to zero beyond EOF and enlarge i_size.
*/
@@ -968,15 +1015,14 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, offset, len, NULL);
if (error)
return error;
/*
* If we zeroed right up to EOF and EOF straddles a page boundary we
* must make sure that the post-EOF area is also zeroed because the
- * page could be mmap'd and iomap_zero_range doesn't do that for us.
+ * page could be mmap'd and xfs_zero_range doesn't do that for us.
* Writeback of the eof page will do this, albeit clumsily.
*/
if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
@@ -1012,9 +1058,9 @@ xfs_prepare_shift(
* extent (after split) during the shift and corrupt the file. Start
* with the block just prior to the start to stabilize the boundary.
*/
- offset = round_down(offset, 1 << mp->m_sb.sb_blocklog);
+ offset = round_down(offset, mp->m_sb.sb_blocksize);
if (offset)
- offset -= (1 << mp->m_sb.sb_blocklog);
+ offset -= mp->m_sb.sb_blocksize;
/*
* Writeback and invalidate cache for the remainder of the file as we're
@@ -1062,7 +1108,6 @@ xfs_collapse_file_space(
int error;
xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
- uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
bool done = false;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1078,32 +1123,34 @@ xfs_collapse_file_space(
if (error)
return error;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
- &tp);
- if (error)
- break;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
- ip->i_gdquot, ip->i_pdquot, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+ while (!done) {
error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
&done);
if (error)
goto out_trans_cancel;
+ if (done)
+ break;
- error = xfs_trans_commit(tp);
+ /* finish any deferred frees and roll the transaction */
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out_trans_cancel;
}
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1146,35 +1193,48 @@ xfs_insert_file_space(
if (error)
return error;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error)
+ goto out_trans_cancel;
+
/*
* The extent shifting code works on extent granularity. So, if stop_fsb
* is not the starting block of extent, we need to split the extent at
* stop_fsb.
*/
- error = xfs_bmap_split_extent(ip, stop_fsb);
+ error = xfs_bmap_split_extent(tp, ip, stop_fsb);
if (error)
- return error;
+ goto out_trans_cancel;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0,
- &tp);
+ do {
+ error = xfs_defer_finish(&tp);
if (error)
- break;
+ goto out_trans_cancel;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
&done, stop_fsb);
if (error)
goto out_trans_cancel;
+ } while (!done);
- error = xfs_trans_commit(tp);
- }
-
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1203,24 +1263,33 @@ xfs_swap_extents_check_format(
struct xfs_inode *ip, /* target inode */
struct xfs_inode *tip) /* tmp inode */
{
+ struct xfs_ifork *ifp = &ip->i_df;
+ struct xfs_ifork *tifp = &tip->i_df;
+
+ /* User/group/project quota ids must match if quotas are enforced. */
+ if (XFS_IS_QUOTA_ON(ip->i_mount) &&
+ (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
+ !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
+ ip->i_projid != tip->i_projid))
+ return -EINVAL;
/* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+ tifp->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
/*
* if the target inode has less extents that then temporary inode then
* why did userspace call us?
*/
- if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ if (ifp->if_nextents < tifp->if_nextents)
return -EINVAL;
/*
* If we have to use the (expensive) rmap swap method, we can
* handle any number of extents and any format.
*/
- if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+ if (xfs_has_rmapbt(ip->i_mount))
return 0;
/*
@@ -1228,20 +1297,18 @@ xfs_swap_extents_check_format(
* form then we will end up with the target inode in the wrong format
* as we already know there are less extents in the temp inode.
*/
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ tifp->if_format == XFS_DINODE_FMT_BTREE)
return -EINVAL;
/* Check temp in extent form to max in target */
- if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
- XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
/* Check target in extent form to max in temp */
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
- XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
/*
@@ -1253,22 +1320,20 @@ xfs_swap_extents_check_format(
* (a common defrag case) which will occur when the temp inode is in
* extent format...
*/
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_Q(ip) &&
- XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+ if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (xfs_inode_has_attr_fork(ip) &&
+ XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
return -EINVAL;
- if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
- XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
}
/* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_Q(tip) &&
- XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+ if (xfs_inode_has_attr_fork(tip) &&
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
return -EINVAL;
- if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
- XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
}
@@ -1320,9 +1385,9 @@ xfs_swap_extent_rmap(
* rmap functions when we go to fix up the rmaps. The flags
* will be switch for reals later.
*/
- tip_flags2 = tip->i_d.di_flags2;
- if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
- tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ tip_flags2 = tip->i_diflags2;
+ if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
+ tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
offset_fsb = 0;
end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
@@ -1364,6 +1429,28 @@ xfs_swap_extent_rmap(
irec.br_blockcount);
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+ if (xfs_bmap_is_real_extent(&uirec)) {
+ error = xfs_iext_count_may_overflow(ip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
+ if (xfs_bmap_is_real_extent(&irec)) {
+ error = xfs_iext_count_may_overflow(tip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
/* Remove the mapping from the donor file. */
xfs_bmap_unmap_extent(tp, tip, &uirec);
@@ -1393,12 +1480,12 @@ xfs_swap_extent_rmap(
offset_fsb += ilen;
}
- tip->i_d.di_flags2 = tip_flags2;
+ tip->i_diflags2 = tip_flags2;
return 0;
out:
trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
- tip->i_d.di_flags2 = tip_flags2;
+ tip->i_diflags2 = tip_flags2;
return error;
}
@@ -1420,15 +1507,15 @@ xfs_swap_extent_forks(
/*
* Count the number of extended attribute blocks
*/
- if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
- (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
&aforkblks);
if (error)
return error;
}
- if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
- (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
+ tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
&taforkblks);
if (error)
@@ -1442,12 +1529,12 @@ xfs_swap_extent_forks(
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
- if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*target_log_flags) |= XFS_ILOG_DOWNER;
- if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*src_log_flags) |= XFS_ILOG_DOWNER;
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
+ if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
+ }
/*
* Swap the data forks of the inodes
@@ -1457,12 +1544,9 @@ xfs_swap_extent_forks(
/*
* Fix the on-disk inode values
*/
- tmp = (uint64_t)ip->i_d.di_nblocks;
- ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
- tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
-
- swap(ip->i_d.di_nextents, tip->i_d.di_nextents);
- swap(ip->i_d.di_format, tip->i_d.di_format);
+ tmp = (uint64_t)ip->i_nblocks;
+ ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
+ tip->i_nblocks = tmp + taforkblks - aforkblks;
/*
* The extents in the source inode could still contain speculative
@@ -1477,24 +1561,24 @@ xfs_swap_extent_forks(
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_d.di_version < 3 ||
+ ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
(*src_log_flags & XFS_ILOG_DOWNER));
(*src_log_flags) |= XFS_ILOG_DBROOT;
break;
}
- switch (tip->i_d.di_format) {
+ switch (tip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
(*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
(*target_log_flags) |= XFS_ILOG_DBROOT;
- ASSERT(tip->i_d.di_version < 3 ||
+ ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
(*target_log_flags & XFS_ILOG_DOWNER));
break;
}
@@ -1555,9 +1639,9 @@ xfs_swap_extents(
struct xfs_bstat *sbp = &sxp->sx_stat;
int src_log_flags, target_log_flags;
int error = 0;
- int lock_flags;
uint64_t f;
int resblks = 0;
+ unsigned int flags = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1566,8 +1650,8 @@ xfs_swap_extents(
* do the rest of the checks.
*/
lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
- lock_flags = XFS_MMAPLOCK_EXCL;
- xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL);
+ filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
/* Verify that both files have the same format */
if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
@@ -1599,7 +1683,7 @@ xfs_swap_extents(
if (xfs_inode_has_cow_data(tip)) {
error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
if (error)
- return error;
+ goto out_unlock;
}
/*
@@ -1607,10 +1691,10 @@ xfs_swap_extents(
* a block reservation because it's really just a remap operation
* performed with log redo items!
*/
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
- int w = XFS_DATA_FORK;
- uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w);
- uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w);
+ if (xfs_has_rmapbt(mp)) {
+ int w = XFS_DATA_FORK;
+ uint32_t ipnext = ip->i_df.if_nextents;
+ uint32_t tipnext = tip->i_df.if_nextents;
/*
* Conceptually this shouldn't affect the shape of either bmbt,
@@ -1621,17 +1705,16 @@ xfs_swap_extents(
resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
/*
- * Handle the corner case where either inode might straddle the
- * btree format boundary. If so, the inode could bounce between
- * btree <-> extent format on unmap -> remap cycles, freeing and
- * allocating a bmapbt block each time.
+ * If either inode straddles a bmapbt block allocation boundary,
+ * the rmapbt algorithm triggers repeated allocs and frees as
+ * extents are remapped. This can exhaust the block reservation
+ * prematurely and cause shutdown. Return freed blocks to the
+ * transaction reservation to counter this behavior.
*/
- if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
- resblks += XFS_IFORK_MAXEXT(ip, w);
- if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
- resblks += XFS_IFORK_MAXEXT(tip, w);
+ flags |= XFS_TRANS_RES_FDBLKS;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
+ &tp);
if (error)
goto out_unlock;
@@ -1640,15 +1723,14 @@ xfs_swap_extents(
* or cancel will unlock the inodes from this point onwards.
*/
xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
- lock_flags |= XFS_ILOCK_EXCL;
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_ijoin(tp, tip, 0);
/* Verify all data are being swapped */
if (sxp->sx_offset != 0 ||
- sxp->sx_length != ip->i_d.di_size ||
- sxp->sx_length != tip->i_d.di_size) {
+ sxp->sx_length != ip->i_disk_size ||
+ sxp->sx_length != tip->i_disk_size) {
error = -EFAULT;
goto out_trans_cancel;
}
@@ -1690,7 +1772,7 @@ xfs_swap_extents(
src_log_flags = XFS_ILOG_CORE;
target_log_flags = XFS_ILOG_CORE;
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_has_rmapbt(mp))
error = xfs_swap_extent_rmap(&tp, ip, tip);
else
error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
@@ -1699,21 +1781,22 @@ xfs_swap_extents(
goto out_trans_cancel;
/* Do we have to swap reflink flags? */
- if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^
- (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) {
- f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
- ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
- tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
- tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+ if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
+ (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
+ f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
+ tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
}
/* Swap the cow forks. */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
- ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+ if (xfs_has_reflink(mp)) {
+ ASSERT(!ip->i_cowfp ||
+ ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(!tip->i_cowfp ||
+ tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
- swap(ip->i_cnextents, tip->i_cnextents);
swap(ip->i_cowfp, tip->i_cowfp);
if (ip->i_cowfp && ip->i_cowfp->if_bytes)
@@ -1750,7 +1833,7 @@ xfs_swap_extents(
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -1758,13 +1841,16 @@ xfs_swap_extents(
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
+out_unlock_ilock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
+ filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
+ VFS_I(tip)->i_mapping);
unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
- goto out_unlock;
+ goto out_unlock_ilock;
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 9f993168b55b..24b37d211f1d 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len, int alloc_type);
+ xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 217e4f82a44a..dde346450952 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -5,22 +5,24 @@
*/
#include "xfs.h"
#include <linux/backing-dev.h>
+#include <linux/dax.h>
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_log_recover.h"
+#include "xfs_log_priv.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_ag.h"
-static kmem_zone_t *xfs_buf_zone;
-
-#define xb_to_gfp(flags) \
- ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
@@ -40,7 +42,7 @@ static kmem_zone_t *xfs_buf_zone;
* pag_buf_lock
* lru_lock
*
- * xfs_buftarg_wait_rele
+ * xfs_buftarg_drain_rele
* lru_lock
* b_lock (trylock due to inversion)
*
@@ -49,6 +51,15 @@ static kmem_zone_t *xfs_buf_zone;
* b_lock (trylock due to inversion)
*/
+static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
+
+static inline int
+xfs_buf_submit(
+ struct xfs_buf *bp)
+{
+ return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -67,7 +78,7 @@ static inline int
xfs_buf_vmap_len(
struct xfs_buf *bp)
{
- return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+ return (bp->b_page_count * PAGE_SIZE);
}
/*
@@ -76,7 +87,7 @@ xfs_buf_vmap_len(
* because the corresponding decrement is deferred to buffer release. Buffers
* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
* tracking adds unnecessary overhead. This is used for sychronization purposes
- * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
* in-flight buffers.
*
* Buffers that are never released (e.g., superblock, iclog buffers) must set
@@ -211,9 +222,7 @@ _xfs_buf_alloc(
int i;
*bpp = NULL;
- bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
- if (unlikely(!bp))
- return -ENOMEM;
+ bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -240,11 +249,11 @@ _xfs_buf_alloc(
*/
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
- kmem_cache_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_cache, bp);
return error;
}
- bp->b_bn = map[0].bm_bn;
+ bp->b_rhash_key = map[0].bm_bn;
bp->b_length = 0;
for (i = 0; i < nmaps; i++) {
bp->b_maps[i].bm_bn = map[i].bm_bn;
@@ -262,187 +271,143 @@ _xfs_buf_alloc(
return 0;
}
-/*
- * Allocate a page array capable of holding a specified number
- * of pages, and point the page buf at it.
- */
-STATIC int
-_xfs_buf_get_pages(
- xfs_buf_t *bp,
- int page_count)
+static void
+xfs_buf_free_pages(
+ struct xfs_buf *bp)
{
- /* Make sure that we have a page list */
- if (bp->b_pages == NULL) {
- bp->b_page_count = page_count;
- if (page_count <= XB_PAGES) {
- bp->b_pages = bp->b_page_array;
- } else {
- bp->b_pages = kmem_alloc(sizeof(struct page *) *
- page_count, KM_NOFS);
- if (bp->b_pages == NULL)
- return -ENOMEM;
- }
- memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
+ uint i;
+
+ ASSERT(bp->b_flags & _XBF_PAGES);
+
+ if (xfs_buf_is_vmapped(bp))
+ vm_unmap_ram(bp->b_addr, bp->b_page_count);
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ if (bp->b_pages[i])
+ __free_page(bp->b_pages[i]);
}
- return 0;
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += bp->b_page_count;
+
+ if (bp->b_pages != bp->b_page_array)
+ kmem_free(bp->b_pages);
+ bp->b_pages = NULL;
+ bp->b_flags &= ~_XBF_PAGES;
}
-/*
- * Frees b_pages if it was allocated.
- */
-STATIC void
-_xfs_buf_free_pages(
- xfs_buf_t *bp)
+static void
+xfs_buf_free_callback(
+ struct callback_head *cb)
{
- if (bp->b_pages != bp->b_page_array) {
- kmem_free(bp->b_pages);
- bp->b_pages = NULL;
- }
+ struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
+
+ xfs_buf_free_maps(bp);
+ kmem_cache_free(xfs_buf_cache, bp);
}
-/*
- * Releases the specified buffer.
- *
- * The modification state of any associated pages is left unchanged.
- * The buffer must not be on any hash - use xfs_buf_rele instead for
- * hashed and refcounted buffers
- */
static void
xfs_buf_free(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
- if (bp->b_flags & _XBF_PAGES) {
- uint i;
+ if (bp->b_flags & _XBF_PAGES)
+ xfs_buf_free_pages(bp);
+ else if (bp->b_flags & _XBF_KMEM)
+ kmem_free(bp->b_addr);
- if (xfs_buf_is_vmapped(bp))
- vm_unmap_ram(bp->b_addr - bp->b_offset,
- bp->b_page_count);
+ call_rcu(&bp->b_rcu, xfs_buf_free_callback);
+}
- for (i = 0; i < bp->b_page_count; i++) {
- struct page *page = bp->b_pages[i];
+static int
+xfs_buf_alloc_kmem(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ xfs_km_flags_t kmflag_mask = KM_NOFS;
+ size_t size = BBTOB(bp->b_length);
- __free_page(page);
- }
- } else if (bp->b_flags & _XBF_KMEM)
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
+ kmflag_mask |= KM_ZERO;
+
+ bp->b_addr = kmem_alloc(size, kmflag_mask);
+ if (!bp->b_addr)
+ return -ENOMEM;
+
+ if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
kmem_free(bp->b_addr);
- _xfs_buf_free_pages(bp);
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_zone, bp);
+ bp->b_addr = NULL;
+ return -ENOMEM;
+ }
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = kmem_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= _XBF_KMEM;
+ return 0;
}
-/*
- * Allocates all the pages for buffer in question and builds it's page list.
- */
-STATIC int
-xfs_buf_allocate_memory(
- xfs_buf_t *bp,
- uint flags)
-{
- size_t size;
- size_t nbytes, offset;
- gfp_t gfp_mask = xb_to_gfp(flags);
- unsigned short page_count, i;
- xfs_off_t start, end;
- int error;
- xfs_km_flags_t kmflag_mask = 0;
+static int
+xfs_buf_alloc_pages(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ gfp_t gfp_mask = __GFP_NOWARN;
+ long filled = 0;
- /*
- * assure zeroed buffer for non-read cases.
- */
- if (!(flags & XBF_READ)) {
- kmflag_mask |= KM_ZERO;
- gfp_mask |= __GFP_ZERO;
+ if (flags & XBF_READ_AHEAD)
+ gfp_mask |= __GFP_NORETRY;
+ else
+ gfp_mask |= GFP_NOFS;
+
+ /* Make sure that we have a page list */
+ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
+ if (bp->b_page_count <= XB_PAGES) {
+ bp->b_pages = bp->b_page_array;
+ } else {
+ bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
+ gfp_mask);
+ if (!bp->b_pages)
+ return -ENOMEM;
}
+ bp->b_flags |= _XBF_PAGES;
+
+ /* Assure zeroed buffer for non-read cases. */
+ if (!(flags & XBF_READ))
+ gfp_mask |= __GFP_ZERO;
/*
- * for buffers that are contained within a single page, just allocate
- * the memory from the heap - there's no need for the complexity of
- * page arrays to keep allocation down to order 0.
+ * Bulk filling of pages can take multiple calls. Not filling the entire
+ * array is not an allocation failure, so don't back off if we get at
+ * least one extra page.
*/
- size = BBTOB(bp->b_length);
- if (size < PAGE_SIZE) {
- int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
- bp->b_addr = kmem_alloc_io(size, align_mask,
- KM_NOFS | kmflag_mask);
- if (!bp->b_addr) {
- /* low memory - use alloc_page loop instead */
- goto use_alloc_page;
- }
+ for (;;) {
+ long last = filled;
- if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
- ((unsigned long)bp->b_addr & PAGE_MASK)) {
- /* b_addr spans two pages - use alloc_page instead */
- kmem_free(bp->b_addr);
- bp->b_addr = NULL;
- goto use_alloc_page;
+ filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
+ bp->b_pages);
+ if (filled == bp->b_page_count) {
+ XFS_STATS_INC(bp->b_mount, xb_page_found);
+ break;
}
- bp->b_offset = offset_in_page(bp->b_addr);
- bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = kmem_to_page(bp->b_addr);
- bp->b_page_count = 1;
- bp->b_flags |= _XBF_KMEM;
- return 0;
- }
-use_alloc_page:
- start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
- end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
- page_count = end - start;
- error = _xfs_buf_get_pages(bp, page_count);
- if (unlikely(error))
- return error;
-
- offset = bp->b_offset;
- bp->b_flags |= _XBF_PAGES;
-
- for (i = 0; i < bp->b_page_count; i++) {
- struct page *page;
- uint retries = 0;
-retry:
- page = alloc_page(gfp_mask);
- if (unlikely(page == NULL)) {
- if (flags & XBF_READ_AHEAD) {
- bp->b_page_count = i;
- error = -ENOMEM;
- goto out_free_pages;
- }
+ if (filled != last)
+ continue;
- /*
- * This could deadlock.
- *
- * But until all the XFS lowlevel code is revamped to
- * handle buffer allocation failures we can't do much.
- */
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
- current->comm, current->pid,
- __func__, gfp_mask);
-
- XFS_STATS_INC(bp->b_mount, xb_page_retries);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry;
+ if (flags & XBF_READ_AHEAD) {
+ xfs_buf_free_pages(bp);
+ return -ENOMEM;
}
- XFS_STATS_INC(bp->b_mount, xb_page_found);
-
- nbytes = min_t(size_t, size, PAGE_SIZE - offset);
- size -= nbytes;
- bp->b_pages[i] = page;
- offset = 0;
+ XFS_STATS_INC(bp->b_mount, xb_page_retries);
+ memalloc_retry_wait(gfp_mask);
}
return 0;
-
-out_free_pages:
- for (i = 0; i < bp->b_page_count; i++)
- __free_page(bp->b_pages[i]);
- bp->b_flags &= ~_XBF_PAGES;
- return error;
}
/*
@@ -450,13 +415,13 @@ out_free_pages:
*/
STATIC int
_xfs_buf_map_pages(
- xfs_buf_t *bp,
- uint flags)
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
- bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
+ bp->b_addr = page_address(bp->b_pages[0]);
} else if (flags & XBF_UNMAPPED) {
bp->b_addr = NULL;
} else {
@@ -474,7 +439,7 @@ _xfs_buf_map_pages(
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1, PAGE_KERNEL);
+ -1);
if (bp->b_addr)
break;
vm_unmap_aliases();
@@ -483,7 +448,6 @@ _xfs_buf_map_pages(
if (!bp->b_addr)
return -ENOMEM;
- bp->b_addr += bp->b_offset;
}
return 0;
@@ -506,7 +470,7 @@ _xfs_buf_obj_cmp(
*/
BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
- if (bp->b_bn != map->bm_bn)
+ if (bp->b_rhash_key != map->bm_bn)
return 1;
if (unlikely(bp->b_length != map->bm_len)) {
@@ -528,7 +492,7 @@ static const struct rhashtable_params xfs_buf_hash_params = {
.min_size = 32, /* empty AGs have minimal footprint */
.nelem_hint = 16,
.key_len = sizeof(xfs_daddr_t),
- .key_offset = offsetof(struct xfs_buf, b_bn),
+ .key_offset = offsetof(struct xfs_buf, b_rhash_key),
.head_offset = offsetof(struct xfs_buf, b_rhash_head),
.automatic_shrinking = true,
.obj_cmpfn = _xfs_buf_obj_cmp,
@@ -549,100 +513,45 @@ xfs_buf_hash_destroy(
rhashtable_destroy(&pag->pag_buf_hash);
}
-/*
- * Look up a buffer in the buffer cache and return it referenced and locked
- * in @found_bp.
- *
- * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
- * cache.
- *
- * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
- * -EAGAIN if we fail to lock it.
- *
- * Return values are:
- * -EFSCORRUPTED if have been supplied with an invalid address
- * -EAGAIN on trylock failure
- * -ENOENT if we fail to find a match and @new_bp was NULL
- * 0, with @found_bp:
- * - @new_bp if we inserted it into the cache
- * - the buffer we found and locked.
- */
static int
-xfs_buf_find(
+xfs_buf_map_verify(
struct xfs_buftarg *btp,
- struct xfs_buf_map *map,
- int nmaps,
- xfs_buf_flags_t flags,
- struct xfs_buf *new_bp,
- struct xfs_buf **found_bp)
+ struct xfs_buf_map *map)
{
- struct xfs_perag *pag;
- xfs_buf_t *bp;
- struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int i;
-
- *found_bp = NULL;
-
- for (i = 0; i < nmaps; i++)
- cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= eofs) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, cmap.bm_bn, eofs);
+ __func__, map->bm_bn, eofs);
WARN_ON(1);
return -EFSCORRUPTED;
}
-
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
-
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
- xfs_buf_hash_params);
- if (bp) {
- atomic_inc(&bp->b_hold);
- goto found;
- }
-
- /* No match found */
- if (!new_bp) {
- XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
- return -ENOENT;
- }
-
- /* the buffer keeps the perag reference until it is freed */
- new_bp->b_pag = pag;
- rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- *found_bp = new_bp;
return 0;
+}
-found:
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
-
- if (!xfs_buf_trylock(bp)) {
- if (flags & XBF_TRYLOCK) {
- xfs_buf_rele(bp);
- XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
+static int
+xfs_buf_find_lock(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ if (flags & XBF_TRYLOCK) {
+ if (!xfs_buf_trylock(bp)) {
+ XFS_STATS_INC(bp->b_mount, xb_busy_locked);
return -EAGAIN;
}
+ } else {
xfs_buf_lock(bp);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
+ XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
}
/*
@@ -652,32 +561,107 @@ found:
*/
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- ASSERT(bp->b_iodone == NULL);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
+ return 0;
+}
+
+static inline int
+xfs_buf_lookup(
+ struct xfs_perag *pag,
+ struct xfs_buf_map *map,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ rcu_read_lock();
+ bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ error = xfs_buf_find_lock(bp, flags);
+ if (error) {
+ xfs_buf_rele(bp);
+ return error;
+ }
trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- *found_bp = bp;
+ *bpp = bp;
return 0;
}
-struct xfs_buf *
-xfs_buf_incore(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+/*
+ * Insert the new_bp into the hash table. This consumes the perag reference
+ * taken for the lookup regardless of the result of the insert.
+ */
+static int
+xfs_buf_find_insert(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag,
+ struct xfs_buf_map *cmap,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
+ struct xfs_buf *new_bp;
struct xfs_buf *bp;
int error;
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
+ error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
- return NULL;
- return bp;
+ goto out_drop_pag;
+
+ /*
+ * For buffers that fit entirely within a single page, first attempt to
+ * allocate the memory from the heap to minimise memory usage. If we
+ * can't get heap memory for these small buffers, we fall back to using
+ * the page allocator.
+ */
+ if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+ xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+ error = xfs_buf_alloc_pages(new_bp, flags);
+ if (error)
+ goto out_free_buf;
+ }
+
+ spin_lock(&pag->pag_buf_lock);
+ bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head, xfs_buf_hash_params);
+ if (IS_ERR(bp)) {
+ error = PTR_ERR(bp);
+ spin_unlock(&pag->pag_buf_lock);
+ goto out_free_buf;
+ }
+ if (bp) {
+ /* found an existing buffer */
+ atomic_inc(&bp->b_hold);
+ spin_unlock(&pag->pag_buf_lock);
+ error = xfs_buf_find_lock(bp, flags);
+ if (error)
+ xfs_buf_rele(bp);
+ else
+ *bpp = bp;
+ goto out_free_buf;
+ }
+
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ *bpp = new_bp;
+ return 0;
+
+out_free_buf:
+ xfs_buf_free(new_bp);
+out_drop_pag:
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -687,48 +671,56 @@ xfs_buf_incore(
*/
int
xfs_buf_get_map(
- struct xfs_buftarg *target,
+ struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
- struct xfs_buf *new_bp;
- int error = 0;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
+ int error;
+ int i;
- *bpp = NULL;
- error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
- if (!error)
- goto found;
- if (error != -ENOENT)
- return error;
+ for (i = 0; i < nmaps; i++)
+ cmap.bm_len += map[i].bm_len;
- error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+ error = xfs_buf_map_verify(btp, &cmap);
if (error)
return error;
- error = xfs_buf_allocate_memory(new_bp, flags);
- if (error) {
- xfs_buf_free(new_bp);
- return error;
- }
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
- error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
- if (error) {
- xfs_buf_free(new_bp);
- return error;
- }
+ error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ if (error && error != -ENOENT)
+ goto out_put_perag;
+
+ /* cache hits always outnumber misses by at least 10:1 */
+ if (unlikely(!bp)) {
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
+
+ if (flags & XBF_INCORE)
+ goto out_put_perag;
- if (bp != new_bp)
- xfs_buf_free(new_bp);
+ /* xfs_buf_find_insert() consumes the perag reference. */
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ flags, &bp);
+ if (error)
+ return error;
+ } else {
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ xfs_perag_put(pag);
+ }
-found:
+ /* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
- xfs_warn(target->bt_mount,
- "%s: failed to map pagesn", __func__);
+ xfs_warn_ratelimited(btp->bt_mount,
+ "%s: failed to map %u pages", __func__,
+ bp->b_page_count);
xfs_buf_relse(bp);
return error;
}
@@ -741,21 +733,25 @@ found:
if (!(flags & XBF_READ))
xfs_buf_ioerror(bp, 0);
- XFS_STATS_INC(target->bt_mount, xb_get);
+ XFS_STATS_INC(btp->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
+
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
}
-STATIC int
+int
_xfs_buf_read(
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
- bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
return xfs_buf_submit(bp);
@@ -852,7 +848,15 @@ xfs_buf_read_map(
* buffer.
*/
if (error) {
- if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
+ /*
+ * Check against log shutdown for error reporting because
+ * metadata writeback may require a read first and we need to
+ * report errors in metadata writeback until the log is shut
+ * down. High level transaction read functions already check
+ * against mount shutdown, anyway, so we only need to be
+ * concerned about low level IO interactions here.
+ */
+ if (!xlog_is_shutdown(target->bt_mount->m_log))
xfs_buf_ioerror_alert(bp, fa);
bp->b_flags &= ~XBF_DONE;
@@ -882,9 +886,6 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_bdi))
- return;
-
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -892,14 +893,16 @@ xfs_buf_readahead_map(
/*
* Read an uncached buffer from disk. Allocates and returns a locked
- * buffer containing the disk contents or nothing.
+ * buffer containing the disk contents or nothing. Uncached buffers always have
+ * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
+ * is cached or uncached during fault diagnosis.
*/
int
xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -914,7 +917,7 @@ xfs_buf_read_uncached(
/* set up the buffer for a read IO */
ASSERT(bp->b_map_count == 1);
- bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
+ bp->b_rhash_key = XFS_BUF_DADDR_NULL;
bp->b_maps[0].bm_bn = daddr;
bp->b_flags |= XBF_READ;
bp->b_ops = ops;
@@ -934,11 +937,10 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- unsigned long page_count;
- int error, i;
+ int error;
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
@@ -947,41 +949,25 @@ xfs_buf_get_uncached(
/* flags might contain irrelevant bits, pass only what we care about */
error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
if (error)
- goto fail;
+ return error;
- page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
- error = _xfs_buf_get_pages(bp, page_count);
+ error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
- for (i = 0; i < page_count; i++) {
- bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
- if (!bp->b_pages[i]) {
- error = -ENOMEM;
- goto fail_free_mem;
- }
- }
- bp->b_flags |= _XBF_PAGES;
-
error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages", __func__);
- goto fail_free_mem;
+ goto fail_free_buf;
}
trace_xfs_buf_get_uncached(bp, _RET_IP_);
*bpp = bp;
return 0;
- fail_free_mem:
- while (--i >= 0)
- __free_page(bp->b_pages[i]);
- _xfs_buf_free_pages(bp);
- fail_free_buf:
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_zone, bp);
- fail:
+fail_free_buf:
+ xfs_buf_free(bp);
return error;
}
@@ -992,7 +978,7 @@ xfs_buf_get_uncached(
*/
void
xfs_buf_hold(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
trace_xfs_buf_hold(bp, _RET_IP_);
atomic_inc(&bp->b_hold);
@@ -1004,7 +990,7 @@ xfs_buf_hold(
*/
void
xfs_buf_rele(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_perag *pag = bp->b_pag;
bool release;
@@ -1148,7 +1134,7 @@ xfs_buf_unlock(
STATIC void
xfs_buf_wait_unpin(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
DECLARE_WAITQUEUE (wait, current);
@@ -1166,20 +1152,145 @@ xfs_buf_wait_unpin(
set_current_state(TASK_RUNNING);
}
+static void
+xfs_buf_ioerror_alert_ratelimited(
+ struct xfs_buf *bp)
+{
+ static unsigned long lasttime;
+ static struct xfs_buftarg *lasttarg;
+
+ if (bp->b_target != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_buf_ioerror_alert(bp, __this_address);
+ }
+ lasttarg = bp->b_target;
+}
+
/*
- * Buffer Utility Routines
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
*/
+static bool
+xfs_buf_ioerror_permanent(
+ struct xfs_buf *bp,
+ struct xfs_error_cfg *cfg)
+{
+ struct xfs_mount *mp = bp->b_mount;
-void
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ return true;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ return true;
+
+ /* At unmount we may treat errors differently */
+ if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
+ return true;
+
+ return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Returns true if this function took care of error handling and the caller must
+ * not touch the buffer again. Return false if the caller should proceed with
+ * normal I/O completion handling.
+ */
+static bool
+xfs_buf_ioend_handle_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_error_cfg *cfg;
+
+ /*
+ * If we've already shutdown the journal because of I/O errors, there's
+ * no point in giving this a retry.
+ */
+ if (xlog_is_shutdown(mp->m_log))
+ goto out_stale;
+
+ xfs_buf_ioerror_alert_ratelimited(bp);
+
+ /*
+ * We're not going to bother about retrying this during recovery.
+ * One strike!
+ */
+ if (bp->b_flags & _XBF_LOGRECOVERY) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+ }
+
+ /*
+ * Synchronous writes will have callers process the error.
+ */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_iodone_async(bp, _RET_IP_);
+
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+ if (bp->b_last_error != bp->b_error ||
+ !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
+ bp->b_last_error = bp->b_error;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
+ goto resubmit;
+ }
+
+ /*
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
+ */
+ if (xfs_buf_ioerror_permanent(bp, cfg)) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out_stale;
+ }
+
+ /* Still considered a transient error. Caller will schedule retries. */
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_io_fail(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_io_fail(bp);
+ else
+ ASSERT(list_empty(&bp->b_li_list));
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
+
+resubmit:
+ xfs_buf_ioerror(bp, 0);
+ bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ xfs_buf_submit(bp);
+ return true;
+out_stale:
+ xfs_buf_stale(bp);
+ bp->b_flags |= XBF_DONE;
+ bp->b_flags &= ~XBF_WRITE;
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+static void
xfs_buf_ioend(
struct xfs_buf *bp)
{
- bool read = bp->b_flags & XBF_READ;
-
trace_xfs_buf_iodone(bp, _RET_IP_);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-
/*
* Pull in IO completion errors now. We are guaranteed to be running
* single threaded, so we don't need the lock to read b_io_error.
@@ -1187,18 +1298,44 @@ xfs_buf_ioend(
if (!bp->b_error && bp->b_io_error)
xfs_buf_ioerror(bp, bp->b_io_error);
- /* Only validate buffers that were read without errors */
- if (read && !bp->b_error && bp->b_ops) {
- ASSERT(!bp->b_iodone);
- bp->b_ops->verify_read(bp);
+ if (bp->b_flags & XBF_READ) {
+ if (!bp->b_error && bp->b_ops)
+ bp->b_ops->verify_read(bp);
+ if (!bp->b_error)
+ bp->b_flags |= XBF_DONE;
+ } else {
+ if (!bp->b_error) {
+ bp->b_flags &= ~XBF_WRITE_FAIL;
+ bp->b_flags |= XBF_DONE;
+ }
+
+ if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
+ return;
+
+ /* clear the retry state */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
+ bp->b_first_retry_time = 0;
+
+ /*
+ * Note that for things like remote attribute buffers, there may
+ * not be a buffer log item here, so processing the buffer log
+ * item must remain optional.
+ */
+ if (bp->b_log_item)
+ xfs_buf_item_done(bp);
+
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_iodone(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_iodone(bp);
+
}
- if (!bp->b_error)
- bp->b_flags |= XBF_DONE;
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
+ _XBF_LOGRECOVERY);
- if (bp->b_iodone)
- (*(bp->b_iodone))(bp);
- else if (bp->b_flags & XBF_ASYNC)
+ if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
else
complete(&bp->b_iowait);
@@ -1209,7 +1346,7 @@ xfs_buf_ioend_work(
struct work_struct *work)
{
struct xfs_buf *bp =
- container_of(work, xfs_buf_t, b_ioend_work);
+ container_of(work, struct xfs_buf, b_ioend_work);
xfs_buf_ioend(bp);
}
@@ -1224,7 +1361,7 @@ xfs_buf_ioend_async(
void
__xfs_buf_ioerror(
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
int error,
xfs_failaddr_t failaddr)
{
@@ -1238,10 +1375,26 @@ xfs_buf_ioerror_alert(
struct xfs_buf *bp,
xfs_failaddr_t func)
{
- xfs_alert(bp->b_mount,
-"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
- func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
- -bp->b_error);
+ xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
+ "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
+ func, (uint64_t)xfs_buf_daddr(bp),
+ bp->b_length, -bp->b_error);
+}
+
+/*
+ * To simulate an I/O failure, the buffer must be locked and held with at least
+ * three references. The LRU reference is dropped by the stale call. The buf
+ * item reference is dropped via ioend processing. The third reference is owned
+ * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ */
+void
+xfs_buf_ioend_fail(
+ struct xfs_buf *bp)
+{
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_ioend(bp);
}
int
@@ -1254,7 +1407,7 @@ xfs_bwrite(
bp->b_flags |= XBF_WRITE;
bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
- XBF_WRITE_FAIL | XBF_DONE);
+ XBF_DONE);
error = xfs_buf_submit(bp);
if (error)
@@ -1268,6 +1421,11 @@ xfs_buf_bio_end_io(
{
struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
+ if (!bio->bi_status &&
+ (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
+ XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ bio->bi_status = BLK_STS_IOERR;
+
/*
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
@@ -1292,10 +1450,10 @@ xfs_buf_ioapply_map(
int map,
int *buf_offset,
int *count,
- int op)
+ blk_opf_t op)
{
int page_index;
- int total_nr_pages = bp->b_page_count;
+ unsigned int total_nr_pages = bp->b_page_count;
int nr_pages;
struct bio *bio;
sector_t sector = bp->b_maps[map].bm_bn;
@@ -1320,14 +1478,12 @@ xfs_buf_ioapply_map(
next_chunk:
atomic_inc(&bp->b_io_remaining);
- nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
+ nr_pages = bio_max_segs(total_nr_pages);
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio_set_dev(bio, bp->b_target->bt_bdev);
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
@@ -1371,7 +1527,7 @@ _xfs_buf_ioapply(
struct xfs_buf *bp)
{
struct blk_plug plug;
- int op;
+ blk_opf_t op;
int offset;
int size;
int i;
@@ -1397,17 +1553,18 @@ _xfs_buf_ioapply(
SHUTDOWN_CORRUPT_INCORE);
return;
}
- } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
+ } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
struct xfs_mount *mp = bp->b_mount;
/*
* non-crc filesystems don't attach verifiers during
* log recovery, so don't warn for such filesystems.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
xfs_warn(mp,
"%s: no buf ops on daddr 0x%llx len %d",
- __func__, bp->b_bn, bp->b_length);
+ __func__, xfs_buf_daddr(bp),
+ bp->b_length);
xfs_hex_dump(bp->b_addr,
XFS_CORRUPTION_DUMP_LEN);
dump_stack();
@@ -1463,7 +1620,7 @@ xfs_buf_iowait(
* safe to reference the buffer after a call to this function unless the caller
* holds an additional reference itself.
*/
-int
+static int
__xfs_buf_submit(
struct xfs_buf *bp,
bool wait)
@@ -1474,12 +1631,24 @@ __xfs_buf_submit(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* on shutdown we stale and complete the buffer immediately */
- if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror(bp, -EIO);
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ /*
+ * On log shutdown we stale and complete the buffer immediately. We can
+ * be called to read the superblock before the log has been set up, so
+ * be careful checking the log state.
+ *
+ * Checking the mount shutdown state here can result in the log tail
+ * moving inappropriately on disk as the log may not yet be shut down.
+ * i.e. failing this buffer on mount shutdown can remove it from the AIL
+ * and move the tail of the log forwards without having written this
+ * buffer to disk. This corrupts the log tail state in memory, and
+ * because the log may not be shut down yet, it can then be propagated
+ * to disk before the log is shutdown. Hence we check log shutdown
+ * state here rather than mount state to avoid corrupting the log tail
+ * on shutdown.
+ */
+ if (bp->b_mount->m_log &&
+ xlog_is_shutdown(bp->b_mount->m_log)) {
+ xfs_buf_ioend_fail(bp);
return -EIO;
}
@@ -1540,7 +1709,6 @@ xfs_buf_offset(
if (bp->b_addr)
return bp->b_addr + offset;
- offset += bp->b_offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
return page_address(page) + (offset & (PAGE_SIZE-1));
}
@@ -1573,6 +1741,28 @@ xfs_buf_zero(
}
/*
+ * Log a message about and stale a buffer that a caller has decided is corrupt.
+ *
+ * This function should be called for the kinds of metadata corruption that
+ * cannot be detect from a verifier, such as incorrect inter-block relationship
+ * data. Do /not/ call this function from a verifier function.
+ *
+ * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
+ * be marked stale, but b_error will not be set. The caller is responsible for
+ * releasing the buffer or fixing it.
+ */
+void
+__xfs_buf_mark_corrupt(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+
+ xfs_buf_corruption_error(bp, fa);
+ xfs_buf_stale(bp);
+}
+
+/*
* Handling of buffer targets (buftargs).
*/
@@ -1582,7 +1772,7 @@ xfs_buf_zero(
* while freeing all the buffers only held by the LRU.
*/
static enum lru_status
-xfs_buftarg_wait_rele(
+xfs_buftarg_drain_rele(
struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
@@ -1594,7 +1784,7 @@ xfs_buftarg_wait_rele(
if (atomic_read(&bp->b_hold) > 1) {
/* need to wait, so skip it this pass */
- trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
if (!spin_trylock(&bp->b_lock))
@@ -1611,13 +1801,13 @@ xfs_buftarg_wait_rele(
return LRU_REMOVED;
}
+/*
+ * Wait for outstanding I/O on the buftarg to complete.
+ */
void
-xfs_wait_buftarg(
+xfs_buftarg_wait(
struct xfs_buftarg *btp)
{
- LIST_HEAD(dispose);
- int loop = 0;
-
/*
* First wait on the buftarg I/O count for all in-flight buffers to be
* released. This is critical as new buffers do not make the LRU until
@@ -1633,10 +1823,21 @@ xfs_wait_buftarg(
while (percpu_counter_sum(&btp->bt_io_count))
delay(100);
flush_workqueue(btp->bt_mount->m_buf_workqueue);
+}
+
+void
+xfs_buftarg_drain(
+ struct xfs_buftarg *btp)
+{
+ LIST_HEAD(dispose);
+ int loop = 0;
+ bool write_fail = false;
+
+ xfs_buftarg_wait(btp);
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
- list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
&dispose, LONG_MAX);
while (!list_empty(&dispose)) {
@@ -1644,17 +1845,29 @@ xfs_wait_buftarg(
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
if (bp->b_flags & XBF_WRITE_FAIL) {
- xfs_alert(btp->bt_mount,
+ write_fail = true;
+ xfs_buf_alert_ratelimited(bp,
+ "XFS: Corruption Alert",
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
- (long long)bp->b_bn);
- xfs_alert(btp->bt_mount,
-"Please run xfs_repair to determine the extent of the problem.");
+ (long long)xfs_buf_daddr(bp));
}
xfs_buf_rele(bp);
}
if (loop++ != 0)
delay(100);
}
+
+ /*
+ * If one or more failed buffers were freed, that means dirty metadata
+ * was thrown away. This should only ever happen after I/O completion
+ * handling has elevated I/O error(s) to permanent failures and shuts
+ * down the journal.
+ */
+ if (write_fail) {
+ ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
+ xfs_alert(btp->bt_mount,
+ "Please run xfs_repair to determine the extent of the problem.");
+ }
}
static enum lru_status
@@ -1731,7 +1944,8 @@ xfs_free_buftarg(
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- xfs_blkdev_issue_flush(btp);
+ blkdev_issue_flush(btp->bt_bdev);
+ fs_put_dax(btp->bt_daxdev, btp->bt_mount);
kmem_free(btp);
}
@@ -1772,20 +1986,31 @@ xfs_setsize_buftarg_early(
return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}
-xfs_buftarg_t *
+struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- struct dax_device *dax_dev)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;
+ const struct dax_holder_operations *ops = NULL;
+#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
+ ops = &xfs_dax_holder_operations;
+#endif
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = dax_dev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ mp, ops);
+
+ /*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages
+ * per 30 seconds so as to not spam logs too much on repeated errors.
+ */
+ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+ DEFAULT_RATELIMIT_BURST);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error_free;
@@ -1800,7 +2025,8 @@ xfs_alloc_buftarg(
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- if (register_shrinker(&btp->bt_shrinker))
+ if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
+ mp->m_super->s_id))
goto error_pcpu;
return btp;
@@ -1890,9 +2116,9 @@ xfs_buf_delwri_queue(
*/
static int
xfs_buf_cmp(
- void *priv,
- struct list_head *a,
- struct list_head *b)
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
@@ -1927,12 +2153,13 @@ xfs_buf_delwri_submit_buffers(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
if (!wait_list) {
+ if (!xfs_buf_trylock(bp))
+ continue;
if (xfs_buf_ispinned(bp)) {
+ xfs_buf_unlock(bp);
pinned++;
continue;
}
- if (!xfs_buf_trylock(bp))
- continue;
} else {
xfs_buf_lock(bp);
}
@@ -1957,7 +2184,7 @@ xfs_buf_delwri_submit_buffers(
* synchronously. Otherwise, drop the buffer from the delwri
* queue and submit async.
*/
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ bp->b_flags &= ~_XBF_DELWRI_Q;
bp->b_flags |= XBF_WRITE;
if (wait_list) {
bp->b_flags &= ~XBF_ASYNC;
@@ -2088,27 +2315,6 @@ xfs_buf_delwri_pushbuf(
return error;
}
-int __init
-xfs_buf_init(void)
-{
- xfs_buf_zone = kmem_cache_create("xfs_buf",
- sizeof(struct xfs_buf), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!xfs_buf_zone)
- goto out;
-
- return 0;
-
- out:
- return -ENOMEM;
-}
-
-void
-xfs_buf_terminate(void)
-{
- kmem_cache_destroy(xfs_buf_zone);
-}
-
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
/*
@@ -2135,7 +2341,7 @@ xfs_verify_magic(
struct xfs_mount *mp = bp->b_mount;
int idx;
- idx = xfs_sb_version_hascrc(&mp->m_sb);
+ idx = xfs_has_crc(mp);
if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
return false;
return dmagic == bp->b_ops->magic[idx];
@@ -2153,7 +2359,7 @@ xfs_verify_magic16(
struct xfs_mount *mp = bp->b_mount;
int idx;
- idx = xfs_sb_version_hascrc(&mp->m_sb);
+ idx = xfs_has_crc(mp);
if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
return false;
return dmagic == bp->b_ops->magic16[idx];
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d79a1fe5d738..549c60942208 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -15,29 +15,39 @@
#include <linux/uio.h>
#include <linux/list_lru.h>
+extern struct kmem_cache *xfs_buf_cache;
+
/*
* Base types
*/
+struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
-/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
+/* buffer type flags for write callbacks */
+#define _XBF_INODES (1u << 16)/* inode buffer */
+#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1u << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
+
+/* flags used only as arguments to access routines */
+#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
+#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
+
typedef unsigned int xfs_buf_flags_t;
@@ -50,12 +60,16 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
- { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
- { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
+ { _XBF_INODES, "INODES" }, \
+ { _XBF_DQUOTS, "DQUOTS" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }
-
+ { _XBF_DELWRI_Q, "DELWRI_Q" }, \
+ /* The following interface flags should never be set */ \
+ { XBF_INCORE, "INCORE" }, \
+ { XBF_TRYLOCK, "TRYLOCK" }, \
+ { XBF_UNMAPPED, "UNMAPPED" }
/*
* Internal state flags.
@@ -80,6 +94,7 @@ typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
+ u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
size_t bt_meta_sectormask;
@@ -91,12 +106,9 @@ typedef struct xfs_buftarg {
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
+ struct ratelimit_state bt_ioerror_rl;
} xfs_buftarg_t;
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-
-
#define XB_PAGES 2
struct xfs_buf_map {
@@ -118,7 +130,7 @@ struct xfs_buf_ops {
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
};
-typedef struct xfs_buf {
+struct xfs_buf {
/*
* first cacheline holds all the fields needed for an uncontended cache
* hit to be fully processed. The semaphore straddles the cacheline
@@ -127,7 +139,8 @@ typedef struct xfs_buf {
* fast-path on locking.
*/
struct rhash_head b_rhash_head; /* pag buffer hash node */
- xfs_daddr_t b_bn; /* block number of buffer */
+
+ xfs_daddr_t b_rhash_key; /* buffer cache index */
int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
@@ -146,10 +159,9 @@ typedef struct xfs_buf {
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
struct xfs_mount *b_mount;
- xfs_buftarg_t *b_target; /* buffer target (device) */
+ struct xfs_buftarg *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
struct work_struct b_ioend_work;
- xfs_buf_iodone_t b_iodone; /* I/O completion function */
struct completion b_iowait; /* queue for I/O waiters */
struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
@@ -162,7 +174,8 @@ typedef struct xfs_buf {
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
- unsigned int b_offset; /* page offset in first page */
+ unsigned int b_offset; /* page offset of b_addr,
+ only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
/*
@@ -185,13 +198,10 @@ typedef struct xfs_buf {
int b_last_error;
const struct xfs_buf_ops *b_ops;
-} xfs_buf_t;
+ struct rcu_head b_rcu;
+};
/* Finding and Reading Buffers */
-struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
- xfs_daddr_t blkno, size_t numblks,
- xfs_buf_flags_t flags);
-
int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
@@ -202,6 +212,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target,
const struct xfs_buf_ops *ops);
static inline int
+xfs_buf_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp);
+}
+
+static inline int
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
@@ -239,39 +262,41 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
- struct xfs_buf **bpp);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, int flags, struct xfs_buf **bpp,
- const struct xfs_buf_ops *ops);
+ size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
+int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
-extern void xfs_buf_rele(xfs_buf_t *);
+extern void xfs_buf_rele(struct xfs_buf *);
/* Locking and Unlocking Buffers */
-extern int xfs_buf_trylock(xfs_buf_t *);
-extern void xfs_buf_lock(xfs_buf_t *);
-extern void xfs_buf_unlock(xfs_buf_t *);
+extern int xfs_buf_trylock(struct xfs_buf *);
+extern void xfs_buf_lock(struct xfs_buf *);
+extern void xfs_buf_unlock(struct xfs_buf *);
#define xfs_buf_islocked(bp) \
((bp)->b_sema.count <= 0)
+static inline void xfs_buf_relse(struct xfs_buf *bp)
+{
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+}
+
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(struct xfs_buf *bp);
+
extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
-
-extern int __xfs_buf_submit(struct xfs_buf *bp, bool);
-static inline int xfs_buf_submit(struct xfs_buf *bp)
-{
- bool wait = bp->b_flags & XBF_ASYNC ? false : true;
- return __xfs_buf_submit(bp, wait);
-}
-
+void xfs_buf_ioend_fail(struct xfs_buf *);
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
+void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
+#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
@@ -284,22 +309,10 @@ extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
-/*
- * These macros use the IO block map rather than b_bn. b_bn is now really
- * just for the buffer cache index for cached buffers. As IO does not use b_bn
- * anymore, uncached buffers do not use b_bn at all and hence must modify the IO
- * map directly. Uncached buffers are not allowed to be discontiguous, so this
- * is safe to do.
- *
- * In future, uncached buffers will pass the block number directly to the io
- * request function and hence these macros will go away at that point.
- */
-#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
-#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
+static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
+{
+ return bp->b_maps[0].bm_bn;
+}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
@@ -320,12 +333,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp)
return atomic_read(&bp->b_pin_count);
}
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
-}
-
static inline int
xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
{
@@ -343,21 +350,16 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
/*
* Handling of buftargs.
*/
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, struct dax_device *);
+struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
+ struct block_device *bdev);
extern void xfs_free_buftarg(struct xfs_buftarg *);
-extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
+extern void xfs_buftarg_wait(struct xfs_buftarg *);
+extern void xfs_buftarg_drain(struct xfs_buftarg *);
+extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-static inline int
-xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
-{
- return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
-}
-
int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 663810e6cd59..522d450a94b1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -12,21 +12,25 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
-#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
-kmem_zone_t *xfs_buf_item_zone;
+struct kmem_cache *xfs_buf_item_cache;
static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
-STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
-
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
@@ -52,38 +56,86 @@ xfs_buf_log_format_size(
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
+static inline bool
+xfs_buf_item_straddle(
+ struct xfs_buf *bp,
+ uint offset,
+ int first_bit,
+ int nbits)
+{
+ void *first, *last;
+
+ first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
+ last = xfs_buf_offset(bp,
+ offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
+
+ if (last - first != nbits * XFS_BLF_CHUNK)
+ return true;
+ return false;
+}
+
/*
- * This returns the number of log iovecs needed to log the
- * given buf log item.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item segment.
*
- * It calculates this as 1 iovec for the buf log format structure
- * and 1 for each stretch of non-contiguous chunks to be logged.
- * Contiguous chunks are logged in a single iovec.
- *
- * If the XFS_BLI_STALE flag has been set, then log nothing.
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
+ * in a single iovec.
*/
STATIC void
xfs_buf_item_size_segment(
struct xfs_buf_log_item *bip,
struct xfs_buf_log_format *blfp,
+ uint offset,
int *nvecs,
int *nbytes)
{
struct xfs_buf *bp = bip->bli_buf;
+ int first_bit;
+ int nbits;
int next_bit;
int last_bit;
- last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
- if (last_bit == -1)
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
+ if (first_bit == -1)
return;
- /*
- * initial count for a dirty buffer is 2 vectors - the format structure
- * and the first dirty region.
- */
- *nvecs += 2;
- *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
+ (*nvecs)++;
+ *nbytes += xfs_buf_log_format_size(blfp);
+
+ do {
+ nbits = xfs_contig_bits(blfp->blf_data_map,
+ blfp->blf_map_size, first_bit);
+ ASSERT(nbits > 0);
+
+ /*
+ * Straddling a page is rare because we don't log contiguous
+ * chunks of unmapped buffers anywhere.
+ */
+ if (nbits > 1 &&
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits))
+ goto slow_scan;
+ (*nvecs)++;
+ *nbytes += nbits * XFS_BLF_CHUNK;
+
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)first_bit + nbits + 1);
+ } while (first_bit != -1);
+
+ return;
+
+slow_scan:
+ /* Count the first bit we jumped out of the above loop from */
+ (*nvecs)++;
+ *nbytes += XFS_BLF_CHUNK;
+ last_bit = first_bit;
while (last_bit != -1) {
/*
* This takes the bit number to start looking from and
@@ -100,29 +152,25 @@ xfs_buf_item_size_segment(
*/
if (next_bit == -1) {
break;
- } else if (next_bit != last_bit + 1) {
- last_bit = next_bit;
- (*nvecs)++;
- } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
- (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
- XFS_BLF_CHUNK)) {
+ } else if (next_bit != last_bit + 1 ||
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
last_bit = next_bit;
+ first_bit = next_bit;
(*nvecs)++;
+ nbits = 1;
} else {
last_bit++;
+ nbits++;
}
*nbytes += XFS_BLF_CHUNK;
}
}
/*
- * This returns the number of log iovecs needed to log the given buf log item.
- *
- * It calculates this as 1 iovec for the buf log format structure and 1 for each
- * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
- * in a single iovec.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item.
*
- * Discontiguous buffers need a format structure per region that that is being
+ * Discontiguous buffers need a format structure per region that is being
* logged. This makes the changes in the buffer appear to log recovery as though
* they came from separate buffers, just like would occur if multiple buffers
* were used instead of a single discontiguous buffer. This enables
@@ -130,7 +178,11 @@ xfs_buf_item_size_segment(
* what ends up on disk.
*
* If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
- * format structures.
+ * format structures. If the item has previously been logged and has dirty
+ * regions, we do not relog them in stale buffers. This has the effect of
+ * reducing the size of the relogged item by the amount of dirty data tracked
+ * by the log item. This can result in the committing transaction reducing the
+ * amount of space being consumed by the CIL.
*/
STATIC void
xfs_buf_item_size(
@@ -139,14 +191,17 @@ xfs_buf_item_size(
int *nbytes)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
int i;
+ int bytes;
+ uint offset = 0;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
if (bip->bli_flags & XFS_BLI_STALE) {
/*
- * The buffer is stale, so all we need to log
- * is the buf log format structure with the
- * cancel flag in it.
+ * The buffer is stale, so all we need to log is the buf log
+ * format structure with the cancel flag in it as we are never
+ * going to replay the changes tracked in the log item.
*/
trace_xfs_buf_item_size_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
@@ -161,9 +216,9 @@ xfs_buf_item_size(
if (bip->bli_flags & XFS_BLI_ORDERED) {
/*
- * The buffer has been logged just to order it.
- * It is not being included in the transaction
- * commit, so no vectors are used at all.
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so no vectors are used at
+ * all.
*/
trace_xfs_buf_item_size_ordered(bip);
*nvecs = XFS_LOG_VEC_ORDERED;
@@ -171,7 +226,7 @@ xfs_buf_item_size(
}
/*
- * the vector count is based on the number of buffer vectors we have
+ * The vector count is based on the number of buffer vectors we have
* dirty bits in. This will only be greater than one when we have a
* compound buffer with more than one segment dirty. Hence for compound
* buffers we need to track which segment the dirty bits correspond to,
@@ -179,10 +234,19 @@ xfs_buf_item_size(
* count for the extra buf log format structure that will need to be
* written.
*/
+ bytes = 0;
for (i = 0; i < bip->bli_format_count; i++) {
- xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
- nvecs, nbytes);
+ xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
+ nvecs, &bytes);
+ offset += BBTOB(bp->b_maps[i].bm_len);
}
+
+ /*
+ * Round up the buffer size required to minimise the number of memory
+ * allocations that need to be done as this item grows when relogged by
+ * repeated modifications.
+ */
+ *nbytes = round_up(bytes, 512);
trace_xfs_buf_item_size(bip);
}
@@ -201,18 +265,6 @@ xfs_buf_item_copy_iovec(
nbits * XFS_BLF_CHUNK);
}
-static inline bool
-xfs_buf_item_straddle(
- struct xfs_buf *bp,
- uint offset,
- int next_bit,
- int last_bit)
-{
- return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
- (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
- XFS_BLF_CHUNK);
-}
-
static void
xfs_buf_item_format_segment(
struct xfs_buf_log_item *bip,
@@ -265,6 +317,38 @@ xfs_buf_item_format_segment(
/*
* Fill in an iovec for each set of contiguous chunks.
*/
+ do {
+ ASSERT(first_bit >= 0);
+ nbits = xfs_contig_bits(blfp->blf_data_map,
+ blfp->blf_map_size, first_bit);
+ ASSERT(nbits > 0);
+
+ /*
+ * Straddling a page is rare because we don't log contiguous
+ * chunks of unmapped buffers anywhere.
+ */
+ if (nbits > 1 &&
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits))
+ goto slow_scan;
+
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
+
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
+ (uint)first_bit + nbits + 1);
+ } while (first_bit != -1);
+
+ return;
+
+slow_scan:
+ ASSERT(bp->b_addr == NULL);
last_bit = first_bit;
nbits = 1;
for (;;) {
@@ -289,7 +373,7 @@ xfs_buf_item_format_segment(
blfp->blf_size++;
break;
} else if (next_bit != last_bit + 1 ||
- xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+ xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
@@ -345,7 +429,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ if (xfs_has_v3inodes(lip->li_log->l_mp) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -391,17 +475,8 @@ xfs_buf_item_pin(
}
/*
- * This is called to unpin the buffer associated with the buf log
- * item which was previously pinned with a call to xfs_buf_item_pin().
- *
- * Also drop the reference to the buf item for the current transaction.
- * If the XFS_BLI_STALE flag is set and we are the last reference,
- * then free up the buf log item and unlock the buffer.
- *
- * If the remove flag is set we are called from uncommit in the
- * forced-shutdown path. If that is true and the reference count on
- * the log item is going to drop to zero we need to free the item's
- * descriptor in the transaction.
+ * This is called to unpin the buffer associated with the buf log item which
+ * was previously pinned with a call to xfs_buf_item_pin().
*/
STATIC void
xfs_buf_item_unpin(
@@ -409,8 +484,7 @@ xfs_buf_item_unpin(
int remove)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
- xfs_buf_t *bp = bip->bli_buf;
- struct xfs_ail *ailp = lip->li_ailp;
+ struct xfs_buf *bp = bip->bli_buf;
int stale = bip->bli_flags & XFS_BLI_STALE;
int freed;
@@ -419,91 +493,63 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin(bip);
+ /*
+ * Drop the bli ref associated with the pin and grab the hold required
+ * for the I/O simulation failure in the abort case. We have to do this
+ * before the pin count drops because the AIL doesn't acquire a bli
+ * reference. Therefore if the refcount drops to zero, the bli could
+ * still be AIL resident and the buffer submitted for I/O (and freed on
+ * completion) at any point before we return. This can be removed once
+ * the AIL properly holds a reference on the bli.
+ */
freed = atomic_dec_and_test(&bip->bli_refcount);
-
+ if (freed && !stale && remove)
+ xfs_buf_hold(bp);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
- if (freed && stale) {
+ /* nothing to do but drop the pin count if the bli is active */
+ if (!freed)
+ return;
+
+ if (stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
ASSERT(bp->b_flags & XBF_STALE);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
trace_xfs_buf_item_unpin_stale(bip);
- if (remove) {
- /*
- * If we are in a transaction context, we have to
- * remove the log item from the transaction as we are
- * about to release our reference to the buffer. If we
- * don't, the unlock that occurs later in
- * xfs_trans_uncommit() will try to reference the
- * buffer which we no longer have a hold on.
- */
- if (!list_empty(&lip->li_trans))
- xfs_trans_del_item(lip);
-
- /*
- * Since the transaction no longer refers to the buffer,
- * the buffer should no longer refer to the transaction.
- */
- bp->b_transp = NULL;
- }
-
/*
- * If we get called here because of an IO error, we may
- * or may not have the item on the AIL. xfs_trans_ail_delete()
- * will take care of that situation.
- * xfs_trans_ail_delete() drops the AIL lock.
+ * If we get called here because of an IO error, we may or may
+ * not have the item on the AIL. xfs_trans_ail_delete() will
+ * take care of that situation. xfs_trans_ail_delete() drops
+ * the AIL lock.
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_do_callbacks(bp);
- bp->b_log_item = NULL;
- list_del_init(&bp->b_li_list);
- bp->b_iodone = NULL;
+ xfs_buf_item_done(bp);
+ xfs_buf_inode_iodone(bp);
+ ASSERT(list_empty(&bp->b_li_list));
} else {
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
- } else if (freed && remove) {
+ } else if (remove) {
/*
- * There are currently two references to the buffer - the active
- * LRU reference and the buf log item. What we are about to do
- * here - simulate a failed IO completion - requires 3
- * references.
- *
- * The LRU reference is removed by the xfs_buf_stale() call. The
- * buf item reference is removed by the xfs_buf_iodone()
- * callback that is run by xfs_buf_do_callbacks() during ioend
- * processing (via the bp->b_iodone callback), and then finally
- * the ioend processing will drop the IO reference if the buffer
- * is marked XBF_ASYNC.
- *
- * Hence we need to take an additional reference here so that IO
- * completion processing doesn't free the buffer prematurely.
+ * The buffer must be locked and held by the caller to simulate
+ * an async I/O failure. We acquired the hold for this case
+ * before the buffer was unpinned.
*/
xfs_buf_lock(bp);
- xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
- xfs_buf_ioerror(bp, -EIO);
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ xfs_buf_ioend_fail(bp);
}
}
-/*
- * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
- * seconds so as to not spam logs too much on repeated detection of the same
- * buffer being bad..
- */
-
-static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
-
STATIC uint
xfs_buf_item_push(
struct xfs_log_item *lip,
@@ -533,11 +579,10 @@ xfs_buf_item_push(
trace_xfs_buf_item_push(bip);
/* has a previous flush failed due to IO errors? */
- if ((bp->b_flags & XBF_WRITE_FAIL) &&
- ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
- xfs_warn(bp->b_mount,
-"Failing async write on buffer block 0x%llx. Retrying async write.",
- (long long)bp->b_bn);
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
+ "Failing async write on buffer block 0x%llx. Retrying async write.",
+ (long long)xfs_buf_daddr(bp));
}
if (!xfs_buf_delwri_queue(bp, buffer_list))
@@ -572,7 +617,7 @@ xfs_buf_item_put(
* that case, the bli is freed on buffer writeback completion.
*/
aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- XFS_FORCED_SHUTDOWN(lip->li_mountp);
+ xlog_is_shutdown(lip->li_log);
dirty = bip->bli_flags & XFS_BLI_DIRTY;
if (dirty && !aborted)
return false;
@@ -584,7 +629,7 @@ xfs_buf_item_put(
* state.
*/
if (aborted)
- xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(lip, 0);
xfs_buf_item_relse(bip->bli_buf);
return true;
}
@@ -657,7 +702,7 @@ xfs_buf_item_release(
STATIC void
xfs_buf_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
return xfs_buf_item_release(lip);
}
@@ -760,7 +805,7 @@ xfs_buf_item_init(
return 0;
}
- bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
+ bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
@@ -781,7 +826,7 @@ xfs_buf_item_init(
map_size = DIV_ROUND_UP(chunks, NBWORD);
if (map_size > XFS_BLF_DATAMAP_SIZE) {
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
xfs_err(mp,
"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
map_size,
@@ -958,19 +1003,15 @@ xfs_buf_item_free(
{
xfs_buf_item_free_format(bip);
kmem_free(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_cache, bip);
}
/*
- * This is called when the buf log item is no longer needed. It should
- * free the buf log item associated with the given buffer and clear
- * the buffer's pointer to the buf log item. If there are no more
- * items in the list, clear the b_iodone field of the buffer (see
- * xfs_buf_attach_iodone() below).
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
*/
void
xfs_buf_item_relse(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -978,312 +1019,28 @@ xfs_buf_item_relse(
ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
bp->b_log_item = NULL;
- if (list_empty(&bp->b_li_list))
- bp->b_iodone = NULL;
-
xfs_buf_rele(bp);
xfs_buf_item_free(bip);
}
-
-/*
- * Add the given log item with its callback to the list of callbacks
- * to be called when the buffer's I/O completes. If it is not set
- * already, set the buffer's b_iodone() routine to be
- * xfs_buf_iodone_callbacks() and link the log item into the list of
- * items rooted at b_li_list.
- */
void
-xfs_buf_attach_iodone(
- struct xfs_buf *bp,
- void (*cb)(struct xfs_buf *, struct xfs_log_item *),
- struct xfs_log_item *lip)
-{
- ASSERT(xfs_buf_islocked(bp));
-
- lip->li_cb = cb;
- list_add_tail(&lip->li_bio_list, &bp->b_li_list);
-
- ASSERT(bp->b_iodone == NULL ||
- bp->b_iodone == xfs_buf_iodone_callbacks);
- bp->b_iodone = xfs_buf_iodone_callbacks;
-}
-
-/*
- * We can have many callbacks on a buffer. Running the callbacks individually
- * can cause a lot of contention on the AIL lock, so we allow for a single
- * callback to be able to scan the remaining items in bp->b_li_list for other
- * items of the same type and callback to be processed in the first call.
- *
- * As a result, the loop walking the callback list below will also modify the
- * list. it removes the first item from the list and then runs the callback.
- * The loop then restarts from the new first item int the list. This allows the
- * callback to scan and modify the list attached to the buffer and we don't
- * have to care about maintaining a next item pointer.
- */
-STATIC void
-xfs_buf_do_callbacks(
- struct xfs_buf *bp)
-{
- struct xfs_buf_log_item *blip = bp->b_log_item;
- struct xfs_log_item *lip;
-
- /* If there is a buf_log_item attached, run its callback */
- if (blip) {
- lip = &blip->bli_item;
- lip->li_cb(bp, lip);
- }
-
- while (!list_empty(&bp->b_li_list)) {
- lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
- li_bio_list);
-
- /*
- * Remove the item from the list, so we don't have any
- * confusion if the item is added to another buf.
- * Don't touch the log item after calling its
- * callback, because it could have freed itself.
- */
- list_del_init(&lip->li_bio_list);
- lip->li_cb(bp, lip);
- }
-}
-
-/*
- * Invoke the error state callback for each log item affected by the failed I/O.
- *
- * If a metadata buffer write fails with a non-permanent error, the buffer is
- * eventually resubmitted and so the completion callbacks are not run. The error
- * state may need to be propagated to the log items attached to the buffer,
- * however, so the next AIL push of the item knows hot to handle it correctly.
- */
-STATIC void
-xfs_buf_do_callbacks_fail(
- struct xfs_buf *bp)
-{
- struct xfs_log_item *lip;
- struct xfs_ail *ailp;
-
- /*
- * Buffer log item errors are handled directly by xfs_buf_item_push()
- * and xfs_buf_iodone_callback_error, and they have no IO error
- * callbacks. Check only for items in b_li_list.
- */
- if (list_empty(&bp->b_li_list))
- return;
-
- lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
- li_bio_list);
- ailp = lip->li_ailp;
- spin_lock(&ailp->ail_lock);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (lip->li_ops->iop_error)
- lip->li_ops->iop_error(lip, bp);
- }
- spin_unlock(&ailp->ail_lock);
-}
-
-static bool
-xfs_buf_iodone_callback_error(
+xfs_buf_item_done(
struct xfs_buf *bp)
{
- struct xfs_buf_log_item *bip = bp->b_log_item;
- struct xfs_log_item *lip;
- struct xfs_mount *mp;
- static ulong lasttime;
- static xfs_buftarg_t *lasttarg;
- struct xfs_error_cfg *cfg;
-
- /*
- * The failed buffer might not have a buf_log_item attached or the
- * log_item list might be empty. Get the mp from the available
- * xfs_log_item
- */
- lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item,
- li_bio_list);
- mp = lip ? lip->li_mountp : bip->bli_item.li_mountp;
-
- /*
- * If we've already decided to shutdown the filesystem because of
- * I/O errors, there's no point in giving this a retry.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
- goto out_stale;
-
- if (bp->b_target != lasttarg ||
- time_after(jiffies, (lasttime + 5*HZ))) {
- lasttime = jiffies;
- xfs_buf_ioerror_alert(bp, __this_address);
- }
- lasttarg = bp->b_target;
-
- /* synchronous writes will have callers process the error */
- if (!(bp->b_flags & XBF_ASYNC))
- goto out_stale;
-
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
- ASSERT(bp->b_iodone != NULL);
-
- cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
-
- /*
- * If the write was asynchronous then no one will be looking for the
- * error. If this is the first failure of this type, clear the error
- * state and write the buffer out again. This means we always retry an
- * async write failure at least once, but we also need to set the buffer
- * up to behave correctly now for repeated failures.
- */
- if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
- bp->b_last_error != bp->b_error) {
- bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
- bp->b_last_error = bp->b_error;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- !bp->b_first_retry_time)
- bp->b_first_retry_time = jiffies;
-
- xfs_buf_ioerror(bp, 0);
- xfs_buf_submit(bp);
- return true;
- }
-
/*
- * Repeated failure on an async write. Take action according to the
- * error configuration we have been set up to use.
- */
-
- if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
- ++bp->b_retries > cfg->max_retries)
- goto permanent_error;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
- goto permanent_error;
-
- /* At unmount we may treat errors differently */
- if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
- goto permanent_error;
-
- /*
- * Still a transient error, run IO completion failure callbacks and let
- * the higher layers retry the buffer.
- */
- xfs_buf_do_callbacks_fail(bp);
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return true;
-
- /*
- * Permanent error - we need to trigger a shutdown if we haven't already
- * to indicate that inconsistency will result from this action.
- */
-permanent_error:
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-out_stale:
- xfs_buf_stale(bp);
- bp->b_flags |= XBF_DONE;
- trace_xfs_buf_error_relse(bp, _RET_IP_);
- return false;
-}
-
-/*
- * This is the iodone() function for buffers which have had callbacks attached
- * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
- * callback list, mark the buffer as having no more callbacks and then push the
- * buffer through IO completion processing.
- */
-void
-xfs_buf_iodone_callbacks(
- struct xfs_buf *bp)
-{
- /*
- * If there is an error, process it. Some errors require us
- * to run callbacks after failure processing is done so we
- * detect that and take appropriate action.
- */
- if (bp->b_error && xfs_buf_iodone_callback_error(bp))
- return;
-
- /*
- * Successful IO or permanent error. Either way, we can clear the
- * retry state here in preparation for the next error that may occur.
- */
- bp->b_last_error = 0;
- bp->b_retries = 0;
- bp->b_first_retry_time = 0;
-
- xfs_buf_do_callbacks(bp);
- bp->b_log_item = NULL;
- list_del_init(&bp->b_li_list);
- bp->b_iodone = NULL;
- xfs_buf_ioend(bp);
-}
-
-/*
- * This is the iodone() function for buffers which have been
- * logged. It is called when they are eventually flushed out.
- * It should remove the buf item from the AIL, and free the buf item.
- * It is called by xfs_buf_iodone_callbacks() above which will take
- * care of cleaning up the buffer itself.
- */
-void
-xfs_buf_iodone(
- struct xfs_buf *bp,
- struct xfs_log_item *lip)
-{
- struct xfs_ail *ailp = lip->li_ailp;
-
- ASSERT(BUF_ITEM(lip)->bli_buf == bp);
-
- xfs_buf_rele(bp);
-
- /*
- * If we are forcibly shutting down, this may well be
- * off the AIL already. That's because we simulate the
- * log-committed callbacks to unpin these buffers. Or we may never
- * have put this item on AIL because of the transaction was
- * aborted forcibly. xfs_trans_ail_delete() takes care of these.
+ * If we are forcibly shutting down, this may well be off the AIL
+ * already. That's because we simulate the log-committed callbacks to
+ * unpin these buffers. Or we may never have put this item on AIL
+ * because of the transaction was aborted forcibly.
+ * xfs_trans_ail_delete() takes care of these.
*
* Either way, AIL is useless if we're forcing a shutdown.
+ *
+ * Note that log recovery writes might have buffer items that are not on
+ * the AIL even when the file system is not shut down.
*/
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
- xfs_buf_item_free(BUF_ITEM(lip));
-}
-
-/*
- * Requeue a failed buffer for writeback.
- *
- * We clear the log item failed state here as well, but we have to be careful
- * about reference counts because the only active reference counts on the buffer
- * may be the failed log items. Hence if we clear the log item failed state
- * before queuing the buffer for IO we can release all active references to
- * the buffer and free it, leading to use after free problems in
- * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
- * order we process them in - the buffer is locked, and we own the buffer list
- * so nothing on them is going to change while we are performing this action.
- *
- * Hence we can safely queue the buffer for IO before we clear the failed log
- * item state, therefore always having an active reference to the buffer and
- * avoiding the transient zero-reference state that leads to use-after-free.
- *
- * Return true if the buffer was added to the buffer list, false if it was
- * already on the buffer list.
- */
-bool
-xfs_buf_resubmit_failed_buffers(
- struct xfs_buf *bp,
- struct list_head *buffer_list)
-{
- struct xfs_log_item *lip;
- bool ret;
-
- ret = xfs_buf_delwri_queue(bp, buffer_list);
-
- /*
- * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
- * function already have it acquired
- */
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
- xfs_clear_li_failed(lip);
-
- return ret;
+ xfs_trans_ail_delete(&bp->b_log_item->bli_item,
+ (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
+ SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_relse(bp);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 30114b510332..4d8a6aece995 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -8,15 +8,18 @@
/* kernel only definitions */
+struct xfs_buf;
+struct xfs_mount;
+
/* buf log item flags */
-#define XFS_BLI_HOLD 0x01
-#define XFS_BLI_DIRTY 0x02
-#define XFS_BLI_STALE 0x04
-#define XFS_BLI_LOGGED 0x08
-#define XFS_BLI_INODE_ALLOC_BUF 0x10
-#define XFS_BLI_STALE_INODE 0x20
-#define XFS_BLI_INODE_BUF 0x40
-#define XFS_BLI_ORDERED 0x80
+#define XFS_BLI_HOLD (1u << 0)
+#define XFS_BLI_DIRTY (1u << 1)
+#define XFS_BLI_STALE (1u << 2)
+#define XFS_BLI_LOGGED (1u << 3)
+#define XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE (1u << 5)
+#define XFS_BLI_INODE_BUF (1u << 6)
+#define XFS_BLI_ORDERED (1u << 7)
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -28,11 +31,6 @@
{ XFS_BLI_INODE_BUF, "INODE_BUF" }, \
{ XFS_BLI_ORDERED, "ORDERED" }
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_buf_log_item;
-
/*
* This is the in core log item structure used to track information
* needed to log buffers. It tracks how many times the lock has been
@@ -50,19 +48,27 @@ struct xfs_buf_log_item {
};
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void xfs_buf_item_done(struct xfs_buf *bp);
void xfs_buf_item_relse(struct xfs_buf *);
bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
-void xfs_buf_attach_iodone(struct xfs_buf *,
- void(*)(struct xfs_buf *, struct xfs_log_item *),
- struct xfs_log_item *);
-void xfs_buf_iodone_callbacks(struct xfs_buf *);
-void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
-bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
- struct list_head *);
+void xfs_buf_inode_iodone(struct xfs_buf *);
+void xfs_buf_inode_io_fail(struct xfs_buf *bp);
+#ifdef CONFIG_XFS_QUOTA
+void xfs_buf_dquot_iodone(struct xfs_buf *);
+void xfs_buf_dquot_io_fail(struct xfs_buf *bp);
+#else
+static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
+{
+}
+static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
+{
+}
+#endif /* CONFIG_XFS_QUOTA */
+void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
-extern kmem_zone_t *xfs_buf_item_zone;
+extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
new file mode 100644
index 000000000000..ffa94102094d
--- /dev/null
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -0,0 +1,1061 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_error.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_quota.h"
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE 64
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
+/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+ xfs_daddr_t bc_blkno;
+ uint bc_len;
+ int bc_refcount;
+ struct list_head bc_list;
+};
+
+static struct xfs_buf_cancel *
+xlog_find_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
+
+ if (!log->l_buf_cancel_table)
+ return NULL;
+
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+ return bcp;
+ }
+
+ return NULL;
+}
+
+static bool
+xlog_add_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct xfs_buf_cancel *bcp;
+
+ /*
+ * If we find an existing cancel record, this indicates that the buffer
+ * was cancelled multiple times. To ensure that during pass 2 we keep
+ * the record in the table until we reach its last occurrence in the
+ * log, a reference count is kept to tell how many times we expect to
+ * see this record during the second pass.
+ */
+ bcp = xlog_find_buffer_cancelled(log, blkno, len);
+ if (bcp) {
+ bcp->bc_refcount++;
+ return false;
+ }
+
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+ bcp->bc_blkno = blkno;
+ bcp->bc_len = len;
+ bcp->bc_refcount = 1;
+ list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno));
+ return true;
+}
+
+/*
+ * Check if there is and entry for blkno, len in the buffer cancel record table.
+ */
+bool
+xlog_is_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ return xlog_find_buffer_cancelled(log, blkno, len) != NULL;
+}
+
+/*
+ * Check if there is and entry for blkno, len in the buffer cancel record table,
+ * and decremented the reference count on it if there is one.
+ *
+ * Remove the cancel record once the refcount hits zero, so that if the same
+ * buffer is re-used again after its last cancellation we actually replay the
+ * changes made at that point.
+ */
+static bool
+xlog_put_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct xfs_buf_cancel *bcp;
+
+ bcp = xlog_find_buffer_cancelled(log, blkno, len);
+ if (!bcp) {
+ ASSERT(0);
+ return false;
+ }
+
+ if (--bcp->bc_refcount == 0) {
+ list_del(&bcp->bc_list);
+ kmem_free(bcp);
+ }
+ return true;
+}
+
+/* log buffer item recovery */
+
+/*
+ * Sort buffer items for log recovery. Most buffer items should end up on the
+ * buffer list and are recovered first, with the following exceptions:
+ *
+ * 1. XFS_BLF_CANCEL buffers must be processed last because some log items
+ * might depend on the incor ecancellation record, and replaying a cancelled
+ * buffer item can remove the incore record.
+ *
+ * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that
+ * we replay di_next_unlinked only after flushing the inode 'free' state
+ * to the inode buffer.
+ *
+ * See xlog_recover_reorder_trans for more details.
+ */
+STATIC enum xlog_recover_reorder
+xlog_recover_buf_reorder(
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+
+ if (buf_f->blf_flags & XFS_BLF_CANCEL)
+ return XLOG_REORDER_CANCEL_LIST;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ return XLOG_REORDER_INODE_BUFFER_LIST;
+ return XLOG_REORDER_BUFFER_LIST;
+}
+
+STATIC void
+xlog_recover_buf_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
+}
+
+/*
+ * Build up the table of buf cancel records so that we don't replay cancelled
+ * data in the second pass.
+ */
+static int
+xlog_recover_buf_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr;
+
+ if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
+ xfs_err(log->l_mp, "bad buffer log item size (%d)",
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!(bf->blf_flags & XFS_BLF_CANCEL))
+ trace_xfs_log_recover_buf_not_cancel(log, bf);
+ else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len))
+ trace_xfs_log_recover_buf_cancel_add(log, bf);
+ else
+ trace_xfs_log_recover_buf_cancel_ref_inc(log, bf);
+ return 0;
+}
+
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
+ char *warnmsg = NULL;
+
+ /*
+ * We can only do post recovery validation on items on CRC enabled
+ * fielsystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
+ */
+ if (!xfs_has_crc(mp))
+ return;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_cntbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ case XFS_RMAP_CRC_MAGIC:
+ bp->b_ops = &xfs_rmapbt_buf_ops;
+ break;
+ case XFS_REFC_CRC_MAGIC:
+ bp->b_ops = &xfs_refcountbt_buf_ops;
+ break;
+ default:
+ warnmsg = "Bad btree block magic!";
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ warnmsg = "Bad AGF block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (magic32 != XFS_AGFL_MAGIC) {
+ warnmsg = "Bad AGFL block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ warnmsg = "Bad AGI block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ warnmsg = "Bad DQUOT block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ if (magic16 != XFS_DINODE_MAGIC) {
+ warnmsg = "Bad INODE block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ warnmsg = "Bad symlink block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ warnmsg = "Bad dir block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ warnmsg = "Bad dir data magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ warnmsg = "Bad dir3 free magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ warnmsg = "Bad dir leaf1 magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ warnmsg = "Bad dir leafn magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ warnmsg = "Bad da node magic!";
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ warnmsg = "Bad attr leaf magic!";
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ warnmsg = "Bad attr remote magic!";
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ warnmsg = "Bad SB block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+#ifdef CONFIG_XFS_RT
+ case XFS_BLFT_RTBITMAP_BUF:
+ case XFS_BLFT_RTSUMMARY_BUF:
+ /* no magic numbers for verification of RT buffers */
+ bp->b_ops = &xfs_rtbuf_ops;
+ break;
+#endif /* CONFIG_XFS_RT */
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+
+ /*
+ * Nothing else to do in the case of a NULL current LSN as this means
+ * the buffer is more recent than the change in the log and will be
+ * skipped.
+ */
+ if (current_lsn == NULLCOMMITLSN)
+ return;
+
+ if (warnmsg) {
+ xfs_warn(mp, warnmsg);
+ ASSERT(0);
+ }
+
+ /*
+ * We must update the metadata LSN of the buffer as it is written out to
+ * ensure that older transactions never replay over this one and corrupt
+ * the buffer. This can occur if log recovery is interrupted at some
+ * point after the current transaction completes, at which point a
+ * subsequent mount starts recovery from the beginning.
+ *
+ * Write verifiers update the metadata LSN from log items attached to
+ * the buffer. Therefore, initialize a bli purely to carry the LSN to
+ * the verifier.
+ */
+ if (bp->b_ops) {
+ struct xfs_buf_log_item *bip;
+
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_item_init(bp, mp);
+ bip = bp->b_log_item;
+ bip->bli_item.li_lsn = current_lsn;
+ }
+}
+
+/*
+ * Perform a 'normal' buffer recovery. Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer. The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+STATIC void
+xlog_recover_do_reg_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ int i;
+ int bit;
+ int nbits;
+ xfs_failaddr_t fa;
+ const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
+
+ trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
+ bit = 0;
+ i = 1; /* 0 is the buf format structure */
+ while (1) {
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ if (bit == -1)
+ break;
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ ASSERT(item->ri_buf[i].i_addr != NULL);
+ ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(BBTOB(bp->b_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
+
+ /*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+ /*
+ * Do a sanity check if this is a dquot buffer. Just checking
+ * the first dquot in the buffer should do. XXXThis is
+ * probably a good thing to do for other buf types also.
+ */
+ fa = NULL;
+ if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ if (item->ri_buf[i].i_addr == NULL) {
+ xfs_alert(mp,
+ "XFS: NULL dquot in %s.", __func__);
+ goto next;
+ }
+ if (item->ri_buf[i].i_len < size_disk_dquot) {
+ xfs_alert(mp,
+ "XFS: dquot too small (%d) in %s.",
+ item->ri_buf[i].i_len, __func__);
+ goto next;
+ }
+ fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+ if (fa) {
+ xfs_alert(mp,
+ "dquot corrupt at %pS trying to replay into block 0x%llx",
+ fa, xfs_buf_daddr(bp));
+ goto next;
+ }
+ }
+
+ memcpy(xfs_buf_offset(bp,
+ (uint)bit << XFS_BLF_SHIFT), /* dest */
+ item->ri_buf[i].i_addr, /* source */
+ nbits<<XFS_BLF_SHIFT); /* length */
+ next:
+ i++;
+ bit += nbits;
+ }
+
+ /* Shouldn't be any more regions */
+ ASSERT(i == item->ri_total);
+
+ xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
+}
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered the buffer to
+ * indicate to the caller if the buffer needs writing.
+ */
+STATIC bool
+xlog_recover_do_dquot_buffer(
+ struct xfs_mount *mp,
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ uint type;
+
+ trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (!mp->m_qflags)
+ return false;
+
+ type = 0;
+ if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
+ type |= XFS_DQTYPE_USER;
+ if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
+ type |= XFS_DQTYPE_PROJ;
+ if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
+ type |= XFS_DQTYPE_GROUP;
+ /*
+ * This type of quotas was turned off, so ignore this buffer
+ */
+ if (log->l_quotaoffs_flag & type)
+ return false;
+
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
+ return true;
+}
+
+/*
+ * Perform recovery for a buffer full of inodes. In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures. The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ *
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes. In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ int i;
+ int item_index = 0;
+ int bit = 0;
+ int nbits = 0;
+ int reg_buf_offset = 0;
+ int reg_buf_bytes = 0;
+ int next_unlinked_offset;
+ int inodes_per_buf;
+ xfs_agino_t *logged_nextp;
+ xfs_agino_t *buffer_nextp;
+
+ trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_has_crc(mp))
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
+ for (i = 0; i < inodes_per_buf; i++) {
+ next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ while (next_unlinked_offset >=
+ (reg_buf_offset + reg_buf_bytes)) {
+ /*
+ * The next di_next_unlinked field is beyond
+ * the current logged region. Find the next
+ * logged region that contains or is beyond
+ * the current di_next_unlinked field.
+ */
+ bit += nbits;
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+
+ /*
+ * If there are no more logged regions in the
+ * buffer, then we're done.
+ */
+ if (bit == -1)
+ return 0;
+
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ reg_buf_offset = bit << XFS_BLF_SHIFT;
+ reg_buf_bytes = nbits << XFS_BLF_SHIFT;
+ item_index++;
+ }
+
+ /*
+ * If the current logged region starts after the current
+ * di_next_unlinked field, then move on to the next
+ * di_next_unlinked field.
+ */
+ if (next_unlinked_offset < reg_buf_offset)
+ continue;
+
+ ASSERT(item->ri_buf[item_index].i_addr != NULL);
+ ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
+
+ /*
+ * The current logged region contains a copy of the
+ * current di_next_unlinked field. Extract its value
+ * and copy it to the buffer copy.
+ */
+ logged_nextp = item->ri_buf[item_index].i_addr +
+ next_unlinked_offset - reg_buf_offset;
+ if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
+ xfs_alert(mp,
+ "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
+ "Trying to replay bad (0) inode di_next_unlinked field.",
+ item, bp);
+ return -EFSCORRUPTED;
+ }
+
+ buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
+ *buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp,
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
+ }
+
+ return 0;
+}
+
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on it's current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
+ void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
+ uint16_t blft;
+
+ /* v4 filesystems always recover immediately */
+ if (!xfs_has_crc(mp))
+ goto recover_immediately;
+
+ /*
+ * realtime bitmap and summary file blocks do not have magic numbers or
+ * UUIDs, so we must recover them immediately.
+ */
+ blft = xfs_blft_from_flags(buf_f);
+ if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+ goto recover_immediately;
+
+ magic32 = be32_to_cpu(*(__be32 *)blk);
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ case XFS_RMAP_CRC_MAGIC:
+ case XFS_REFC_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
+ case XFS_AGF_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
+ case XFS_AGFL_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
+ case XFS_AGI_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
+ case XFS_SYMLINK_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
+ case XFS_DIR3_BLOCK_MAGIC:
+ case XFS_DIR3_DATA_MAGIC:
+ case XFS_DIR3_FREE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
+ case XFS_ATTR3_RMT_MAGIC:
+ /*
+ * Remote attr blocks are written synchronously, rather than
+ * being logged. That means they do not contain a valid LSN
+ * (i.e. transactionally ordered) in them, and hence any time we
+ * see a buffer to replay over the top of a remote attribute
+ * block we should simply do so.
+ */
+ goto recover_immediately;
+ case XFS_SB_MAGIC:
+ /*
+ * superblock uuids are magic. We may or may not have a
+ * sb_meta_uuid on disk, but it will be set in the in-core
+ * superblock. We set the uuid pointer for verification
+ * according to the superblock feature mask to ensure we check
+ * the relevant UUID in the superblock.
+ */
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ if (xfs_has_metauuid(mp))
+ uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+ else
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+ switch (magicda) {
+ case XFS_DIR3_LEAF1_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ /*
+ * We do individual object checks on dquot and inode buffers as they
+ * have their own individual LSN records. Also, we could have a stale
+ * buffer here, so we have to at least recognise these buffer types.
+ *
+ * A notd complexity here is inode unlinked list processing - it logs
+ * the inode directly in the buffer, but we don't know which inodes have
+ * been modified, and there is no global buffer LSN. Hence we need to
+ * recover all inode buffer types immediately. This problem will be
+ * fixed by logical logging of the unlinked list modifications.
+ */
+ magic16 = be16_to_cpu(*(__be16 *)blk);
+ switch (magic16) {
+ case XFS_DQUOT_MAGIC:
+ case XFS_DINODE_MAGIC:
+ goto recover_immediately;
+ default:
+ break;
+ }
+
+ /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+ return (xfs_lsn_t)-1;
+
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently. Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields. This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery. During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table. See xlog_recover_buf_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_buf_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ int error;
+ uint buf_flags;
+ xfs_lsn_t lsn;
+
+ /*
+ * In this pass we only want to recover all the buffers which have
+ * not been cancelled and are not cancellation buffers themselves.
+ */
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
+ if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len))
+ goto cancelled;
+ } else {
+
+ if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len))
+ goto cancelled;
+ }
+
+ trace_xfs_log_recover_buf_recover(log, buf_f);
+
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
+
+ error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Recover the buffer only if we get an LSN from it and it's less than
+ * the lsn of the transaction we are replaying.
+ *
+ * Note that we have to be extremely careful of readahead here.
+ * Readahead does not attach verfiers to the buffers so if we don't
+ * actually do any replay after readahead because of the LSN we found
+ * in the buffer if more recent than that current transaction then we
+ * need to attach the verifier directly. Failure to do so can lead to
+ * future recovery actions (e.g. EFI and unlinked list recovery) can
+ * operate on the buffers and they won't get the verifier attached. This
+ * can lead to blocks on disk having the correct content but a stale
+ * CRC.
+ *
+ * It is safe to assume these clean buffers are currently up to date.
+ * If the buffer is dirtied by a later transaction being replayed, then
+ * the verifier will be reset to match whatever recover turns that
+ * buffer into.
+ */
+ lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_buf_skip(log, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+ goto out_release;
+ }
+
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+ if (error)
+ goto out_release;
+ } else if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ bool dirty;
+
+ dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+ if (!dirty)
+ goto out_release;
+ } else {
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+ }
+
+ /*
+ * Perform delayed write on the buffer. Asynchronous writes will be
+ * slower when taking into account all the buffers to be flushed.
+ *
+ * Also make sure that only inode buffers with good sizes stay in
+ * the buffer cache. The kernel moves inodes in buffers of 1 block
+ * or inode_cluster_size bytes, whichever is bigger. The inode
+ * buffers in the log can be a different size if the log was generated
+ * by an older kernel using unclustered inode buffers or a newer kernel
+ * running with a different inode cluster size. Regardless, if
+ * the inode buffer size isn't max(blocksize, inode_cluster_size)
+ * for *our* value of inode_cluster_size, then we need to keep
+ * the buffer out of the buffer cache so that the buffer won't
+ * overlap with future reads of those inodes.
+ */
+ if (XFS_DINODE_MAGIC ==
+ be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
+ (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
+ xfs_buf_stale(bp);
+ error = xfs_bwrite(bp);
+ } else {
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+
+out_release:
+ xfs_buf_relse(bp);
+ return error;
+cancelled:
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_buf_item_ops = {
+ .item_type = XFS_LI_BUF,
+ .reorder = xlog_recover_buf_reorder,
+ .ra_pass2 = xlog_recover_buf_ra_pass2,
+ .commit_pass1 = xlog_recover_buf_commit_pass1,
+ .commit_pass2 = xlog_recover_buf_commit_pass2,
+};
+
+#ifdef DEBUG
+void
+xlog_check_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+}
+#endif
+
+int
+xlog_alloc_buf_cancel_table(
+ struct xlog *log)
+{
+ void *p;
+ int i;
+
+ ASSERT(log->l_buf_cancel_table == NULL);
+
+ p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ log->l_buf_cancel_table = p;
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+ return 0;
+}
+
+void
+xlog_free_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ if (!log->l_buf_cancel_table)
+ return;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+ struct xfs_buf_cancel *bc;
+
+ while ((bc = list_first_entry_or_null(
+ &log->l_buf_cancel_table[i],
+ struct xfs_buf_cancel, bc_list))) {
+ list_del(&bc->bc_list);
+ kmem_free(bc);
+ }
+ }
+
+ kmem_free(log->l_buf_cancel_table);
+ log->l_buf_cancel_table = NULL;
+}
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 0d3b640cf1cc..9f3ceb461515 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -32,7 +32,7 @@ xfs_dir3_get_dtype(
struct xfs_mount *mp,
uint8_t filetype)
{
- if (!xfs_sb_version_hasftype(&mp->m_sb))
+ if (!xfs_has_ftype(mp))
return DT_UNKNOWN;
if (filetype >= XFS_DIR3_FT_MAX)
@@ -57,8 +57,8 @@ xfs_dir2_sf_getdents(
xfs_ino_t ino;
struct xfs_da_geometry *geo = args->geo;
- ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -138,7 +138,8 @@ xfs_dir2_sf_getdents(
STATIC int
xfs_dir2_block_getdents(
struct xfs_da_args *args,
- struct dir_context *ctx)
+ struct dir_context *ctx,
+ unsigned int *lock_mode)
{
struct xfs_inode *dp = args->dp; /* incore directory inode */
struct xfs_buf *bp; /* buffer for block */
@@ -146,8 +147,7 @@ xfs_dir2_block_getdents(
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
- int lock_mode;
- unsigned int offset;
+ unsigned int offset, next_offset;
unsigned int end;
/*
@@ -156,12 +156,13 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
- lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir3_block_read(args->trans, dp, &bp);
- xfs_iunlock(dp, lock_mode);
if (error)
return error;
+ xfs_iunlock(dp, *lock_mode);
+ *lock_mode = 0;
+
/*
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
@@ -173,9 +174,10 @@ xfs_dir2_block_getdents(
* Loop over the data portion of the block.
* Each object is a real entry (dep) or an unused one (dup).
*/
- offset = geo->data_entry_offset;
end = xfs_dir3_data_end_offset(geo, bp->b_addr);
- while (offset < end) {
+ for (offset = geo->data_entry_offset;
+ offset < end;
+ offset = next_offset) {
struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
uint8_t filetype;
@@ -184,14 +186,15 @@ xfs_dir2_block_getdents(
* Unused, skip it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- offset += be16_to_cpu(dup->length);
+ next_offset = offset + be16_to_cpu(dup->length);
continue;
}
/*
* Bump pointer for the next iteration.
*/
- offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
+ next_offset = offset +
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
/*
* The entry is before the desired starting point, skip it.
@@ -245,7 +248,7 @@ xfs_dir2_leaf_readbuf(
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
struct xfs_da_geometry *geo = args->geo;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
struct xfs_bmbt_irec map;
struct blk_plug plug;
xfs_dir2_off_t new_off;
@@ -256,11 +259,9 @@ xfs_dir2_leaf_readbuf(
int ra_want;
int error = 0;
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
- if (error)
- goto out;
- }
+ error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
+ if (error)
+ goto out;
/*
* Look for mapped directory blocks at or above the current offset.
@@ -344,7 +345,8 @@ STATIC int
xfs_dir2_leaf_getdents(
struct xfs_da_args *args,
struct dir_context *ctx,
- size_t bufsize)
+ size_t bufsize,
+ unsigned int *lock_mode)
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
@@ -356,7 +358,6 @@ xfs_dir2_leaf_getdents(
xfs_dir2_off_t curoff; /* current overall offset */
int length; /* temporary length value */
int byteoff; /* offset in current block */
- int lock_mode;
unsigned int offset = 0;
int error = 0; /* error return value */
@@ -390,13 +391,16 @@ xfs_dir2_leaf_getdents(
bp = NULL;
}
- lock_mode = xfs_ilock_data_map_shared(dp);
+ if (*lock_mode == 0)
+ *lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
&rablk, &bp);
- xfs_iunlock(dp, lock_mode);
if (error || !bp)
break;
+ xfs_iunlock(dp, *lock_mode);
+ *lock_mode = 0;
+
xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
@@ -496,7 +500,7 @@ xfs_dir2_leaf_getdents(
*
* If supplied, the transaction collects locked dir buffers to avoid
* nested buffer deadlocks. This function does not dirty the
- * transaction. The caller should ensure that the inode is locked
+ * transaction. The caller must hold the IOLOCK (shared or exclusive)
* before calling this function.
*/
int
@@ -507,29 +511,40 @@ xfs_readdir(
size_t bufsize)
{
struct xfs_da_args args = { NULL };
- int rval;
- int v;
+ unsigned int lock_mode;
+ bool isblock;
+ int error;
trace_xfs_readdir(dp);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
args.trans = tp;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(&args, ctx);
- else if ((rval = xfs_dir2_isblock(&args, &v)))
- ;
- else if (v)
- rval = xfs_dir2_block_getdents(&args, ctx);
- else
- rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ return xfs_dir2_sf_getdents(&args, ctx);
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
+ error = xfs_dir2_isblock(&args, &isblock);
+ if (error)
+ goto out_unlock;
- return rval;
+ if (isblock) {
+ error = xfs_dir2_block_getdents(&args, ctx, &lock_mode);
+ goto out_unlock;
+ }
+
+ error = xfs_dir2_leaf_getdents(&args, ctx, bufsize, &lock_mode);
+
+out_unlock:
+ if (lock_mode)
+ xfs_iunlock(dp, lock_mode);
+ return error;
}
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 0b8350e84d28..bfc829c07f03 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -8,7 +8,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
@@ -18,6 +17,7 @@
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
STATIC int
xfs_trim_extents(
@@ -31,6 +31,7 @@ xfs_trim_extents(
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
+ struct xfs_agf *agf;
struct xfs_perag *pag;
int error;
int i;
@@ -44,17 +45,17 @@ xfs_trim_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
if (error)
goto out_put_perag;
+ agf = agbp->b_addr;
- cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
/*
* Look up the longest btree in the AGF and start with it.
*/
- error = xfs_alloc_lookup_ge(cur, 0,
- be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+ error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
if (error)
goto out_del_cursor;
@@ -75,7 +76,7 @@ xfs_trim_extents(
error = -EFSCORRUPTED;
goto out_del_cursor;
}
- ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+ ASSERT(flen <= be32_to_cpu(agf->agf_longest));
/*
* use daddr format for all range/len calculations as that is
@@ -107,13 +108,13 @@ xfs_trim_extents(
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
*/
- if (xfs_extent_busy_search(mp, agno, fbno, flen)) {
+ if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
trace_xfs_discard_busy(mp, agno, fbno, flen);
goto next_extent;
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+ error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -151,8 +152,8 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
- unsigned int granularity = q->limits.discard_granularity;
+ unsigned int granularity =
+ bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
@@ -161,14 +162,14 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
return -EOPNOTSUPP;
/*
* We haven't recovered the log, so we cannot use our bnobt-guided
* storage zapping commands.
*/
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
return -EROFS;
if (copy_from_user(&range, urange, sizeof(range)))
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index d223e1ae90a6..8fb90da89787 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -23,6 +23,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
+#include "xfs_error.h"
/*
* Lock order:
@@ -37,8 +38,8 @@
* otherwise by the lowest id first, see xfs_dqlock2.
*/
-struct kmem_zone *xfs_qm_dqtrxzone;
-static struct kmem_zone *xfs_qm_dqzone;
+struct kmem_cache *xfs_dqtrx_cache;
+static struct kmem_cache *xfs_dquot_cache;
static struct lock_class_key xfs_dquot_group_class;
static struct lock_class_key xfs_dquot_project_class;
@@ -56,7 +57,7 @@ xfs_qm_dqdestroy(
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
- kmem_cache_free(xfs_qm_dqzone, dqp);
+ kmem_cache_free(xfs_dquot_cache, dqp);
}
/*
@@ -66,38 +67,79 @@ xfs_qm_dqdestroy(
*/
void
xfs_qm_adjust_dqlimits(
- struct xfs_mount *mp,
struct xfs_dquot *dq)
{
+ struct xfs_mount *mp = dq->q_mount;
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_disk_dquot *d = &dq->q_core;
struct xfs_def_quota *defq;
int prealloc = 0;
- ASSERT(d->d_id);
- defq = xfs_get_defquota(dq, q);
+ ASSERT(dq->q_id);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dq));
- if (defq->bsoftlimit && !d->d_blk_softlimit) {
- d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
+ if (!dq->q_blk.softlimit) {
+ dq->q_blk.softlimit = defq->blk.soft;
prealloc = 1;
}
- if (defq->bhardlimit && !d->d_blk_hardlimit) {
- d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
+ if (!dq->q_blk.hardlimit) {
+ dq->q_blk.hardlimit = defq->blk.hard;
prealloc = 1;
}
- if (defq->isoftlimit && !d->d_ino_softlimit)
- d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
- if (defq->ihardlimit && !d->d_ino_hardlimit)
- d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
- if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
- d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
- if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
- d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
+ if (!dq->q_ino.softlimit)
+ dq->q_ino.softlimit = defq->ino.soft;
+ if (!dq->q_ino.hardlimit)
+ dq->q_ino.hardlimit = defq->ino.hard;
+ if (!dq->q_rtb.softlimit)
+ dq->q_rtb.softlimit = defq->rtb.soft;
+ if (!dq->q_rtb.hardlimit)
+ dq->q_rtb.hardlimit = defq->rtb.hard;
if (prealloc)
xfs_dquot_set_prealloc_limits(dq);
}
+/* Set the expiration time of a quota's grace period. */
+time64_t
+xfs_dquot_set_timeout(
+ struct xfs_mount *mp,
+ time64_t timeout)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+
+ return clamp_t(time64_t, timeout, qi->qi_expiry_min,
+ qi->qi_expiry_max);
+}
+
+/* Set the length of the default grace period. */
+time64_t
+xfs_dquot_set_grace_period(
+ time64_t grace)
+{
+ return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX);
+}
+
+/*
+ * Determine if this quota counter is over either limit and set the quota
+ * timers as appropriate.
+ */
+static inline void
+xfs_qm_adjust_res_timer(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim)
+{
+ ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit);
+
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (res->timer == 0)
+ res->timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + qlim->time);
+ } else {
+ res->timer = 0;
+ }
+}
+
/*
* Check the limits and timers of a dquot and start or reset timers
* if necessary.
@@ -113,91 +155,18 @@ xfs_qm_adjust_dqlimits(
*/
void
xfs_qm_adjust_dqtimers(
- struct xfs_mount *mp,
- struct xfs_disk_dquot *d)
+ struct xfs_dquot *dq)
{
- ASSERT(d->d_id);
-
-#ifdef DEBUG
- if (d->d_blk_hardlimit)
- ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
- be64_to_cpu(d->d_blk_hardlimit));
- if (d->d_ino_hardlimit)
- ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
- be64_to_cpu(d->d_ino_hardlimit));
- if (d->d_rtb_hardlimit)
- ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
- be64_to_cpu(d->d_rtb_hardlimit));
-#endif
-
- if (!d->d_btimer) {
- if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >
- be64_to_cpu(d->d_blk_softlimit))) ||
- (d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >
- be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = cpu_to_be32(ktime_get_real_seconds() +
- mp->m_quotainfo->qi_btimelimit);
- } else {
- d->d_bwarns = 0;
- }
- } else {
- if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <=
- be64_to_cpu(d->d_blk_softlimit))) &&
- (!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <=
- be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = 0;
- }
- }
+ struct xfs_mount *mp = dq->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
- if (!d->d_itimer) {
- if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >
- be64_to_cpu(d->d_ino_softlimit))) ||
- (d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >
- be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = cpu_to_be32(ktime_get_real_seconds() +
- mp->m_quotainfo->qi_itimelimit);
- } else {
- d->d_iwarns = 0;
- }
- } else {
- if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <=
- be64_to_cpu(d->d_ino_softlimit))) &&
- (!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <=
- be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = 0;
- }
- }
+ ASSERT(dq->q_id);
+ defq = xfs_get_defquota(qi, xfs_dquot_type(dq));
- if (!d->d_rtbtimer) {
- if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >
- be64_to_cpu(d->d_rtb_softlimit))) ||
- (d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >
- be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() +
- mp->m_quotainfo->qi_rtbtimelimit);
- } else {
- d->d_rtbwarns = 0;
- }
- } else {
- if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <=
- be64_to_cpu(d->d_rtb_softlimit))) &&
- (!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <=
- be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = 0;
- }
- }
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb);
}
/*
@@ -205,20 +174,40 @@ xfs_qm_adjust_dqtimers(
*/
STATIC void
xfs_qm_init_dquot_blk(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- xfs_buf_t *bp)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct xfs_buf *bp)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- xfs_dqblk_t *d;
- xfs_dqid_t curid;
- int i;
+ struct xfs_dqblk *d;
+ xfs_dqid_t curid;
+ unsigned int qflag;
+ unsigned int blftype;
+ int i;
ASSERT(tp);
ASSERT(xfs_buf_islocked(bp));
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ qflag = XFS_UQUOTA_CHKD;
+ blftype = XFS_BLF_UDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_PROJ:
+ qflag = XFS_PQUOTA_CHKD;
+ blftype = XFS_BLF_PDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_GROUP:
+ qflag = XFS_GQUOTA_CHKD;
+ blftype = XFS_BLF_GDQUOT_BUF;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
d = bp->b_addr;
/*
@@ -230,19 +219,38 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
- d->dd_diskdq.d_flags = type;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ d->dd_diskdq.d_type = type;
+ if (curid > 0 && xfs_has_bigtime(mp))
+ d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME;
+ if (xfs_has_crc(mp)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
}
- xfs_trans_dquot_buf(tp, bp,
- (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
- ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
- XFS_BLF_GDQUOT_BUF)));
- xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+ xfs_trans_dquot_buf(tp, bp, blftype);
+
+ /*
+ * quotacheck uses delayed writes to update all the dquots on disk in an
+ * efficient manner instead of logging the individual dquot changes as
+ * they are made. However if we log the buffer allocated here and crash
+ * after quotacheck while the logged initialisation is still in the
+ * active region of the log, log recovery can replay the dquot buffer
+ * initialisation over the top of the checked dquots and corrupt quota
+ * accounting.
+ *
+ * To avoid this problem, quotacheck cannot log the initialised buffer.
+ * We must still dirty the buffer and write it back before the
+ * allocation transaction clears the log. Therefore, mark the buffer as
+ * ordered instead of logging it directly. This is safe for quotacheck
+ * because it detects and repairs allocated but initialized dquot blocks
+ * in the quota inodes.
+ */
+ if (!(mp->m_qflags & qflag))
+ xfs_trans_ordered_buf(tp, bp);
+ else
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
/*
@@ -255,8 +263,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
{
uint64_t space;
- dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
- dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit;
+ dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit;
if (!dqp->q_prealloc_lo_wmark) {
dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
do_div(dqp->q_prealloc_lo_wmark, 100);
@@ -278,37 +286,52 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
*/
STATIC int
xfs_dquot_disk_alloc(
- struct xfs_trans **tpp,
struct xfs_dquot *dqp,
struct xfs_buf **bpp)
{
struct xfs_bmbt_irec map;
- struct xfs_trans *tp = *tpp;
- struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = dqp->q_mount;
struct xfs_buf *bp;
- struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+ xfs_dqtype_t qtype = xfs_dquot_type(dqp);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
int nmaps = 1;
int error;
trace_xfs_dqalloc(dqp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
+ return error;
+
xfs_ilock(quotip, XFS_ILOCK_EXCL);
- if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ xfs_trans_ijoin(tp, quotip, 0);
+
+ if (!xfs_this_quota_on(dqp->q_mount, qtype)) {
/*
* Return if this type of quotas is turned off while we didn't
* have an inode lock
*/
- xfs_iunlock(quotip, XFS_ILOCK_EXCL);
- return -ESRCH;
+ error = -ESRCH;
+ goto err_cancel;
}
+ error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, quotip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto err_cancel;
+
/* Create the block mapping. */
- xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
&nmaps);
if (error)
- return error;
+ goto err_cancel;
+
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
ASSERT(nmaps == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -323,15 +346,14 @@ xfs_dquot_disk_alloc(
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp);
if (error)
- return error;
+ goto err_cancel;
bp->b_ops = &xfs_dquot_buf_ops;
/*
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
- dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+ xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
@@ -354,16 +376,25 @@ xfs_dquot_disk_alloc(
* is responsible for unlocking any buffer passed back, either
* manually or by committing the transaction. On error, the buffer is
* released and not passed back.
+ *
+ * Keep the quota inode ILOCKed until after the transaction commit to
+ * maintain the atomicity of bmap/rmap updates.
*/
xfs_trans_bhold(tp, bp);
- error = xfs_defer_finish(tpp);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
if (error) {
- xfs_trans_bhold_release(*tpp, bp);
- xfs_trans_brelse(*tpp, bp);
+ xfs_buf_relse(bp);
return error;
}
+
*bpp = bp;
return 0;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+ return error;
}
/*
@@ -378,13 +409,14 @@ xfs_dquot_disk_read(
{
struct xfs_bmbt_irec map;
struct xfs_buf *bp;
- struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+ xfs_dqtype_t qtype = xfs_dquot_type(dqp);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
uint lock_mode;
int nmaps = 1;
int error;
lock_mode = xfs_ilock_data_map_shared(quotip);
- if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
+ if (!xfs_this_quota_on(mp, qtype)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
@@ -436,14 +468,14 @@ STATIC struct xfs_dquot *
xfs_dquot_alloc(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type)
+ xfs_dqtype_t type)
{
struct xfs_dquot *dqp;
- dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
+ dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_type = type;
+ dqp->q_id = id;
dqp->q_mount = mp;
INIT_LIST_HEAD(&dqp->q_lru);
mutex_init(&dqp->q_qlock);
@@ -453,7 +485,7 @@ xfs_dquot_alloc(
* Offset of dquot in the (fixed sized) dquot chunk.
*/
dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
/*
* Because we want to use a counting completion, complete
@@ -468,13 +500,13 @@ xfs_dquot_alloc(
* quotas.
*/
switch (type) {
- case XFS_DQ_USER:
+ case XFS_DQTYPE_USER:
/* uses the default lock class */
break;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
break;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
break;
default:
@@ -488,64 +520,123 @@ xfs_dquot_alloc(
return dqp;
}
+/* Check the ondisk dquot's id and type match what the incore dquot expects. */
+static bool
+xfs_dquot_check_type(
+ struct xfs_dquot *dqp,
+ struct xfs_disk_dquot *ddqp)
+{
+ uint8_t ddqp_type;
+ uint8_t dqp_type;
+
+ ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK;
+ dqp_type = xfs_dquot_type(dqp);
+
+ if (be32_to_cpu(ddqp->d_id) != dqp->q_id)
+ return false;
+
+ /*
+ * V5 filesystems always expect an exact type match. V4 filesystems
+ * expect an exact match for user dquots and for non-root group and
+ * project dquots.
+ */
+ if (xfs_has_crc(dqp->q_mount) ||
+ dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0)
+ return ddqp_type == dqp_type;
+
+ /*
+ * V4 filesystems support either group or project quotas, but not both
+ * at the same time. The non-user quota file can be switched between
+ * group and project quota uses depending on the mount options, which
+ * means that we can encounter the other type when we try to load quota
+ * defaults. Quotacheck will soon reset the entire quota file
+ * (including the root dquot) anyway, but don't log scary corruption
+ * reports to dmesg.
+ */
+ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ;
+}
+
/* Copy the in-core quota fields in from the on-disk buffer. */
-STATIC void
+STATIC int
xfs_dquot_from_disk(
struct xfs_dquot *dqp,
struct xfs_buf *bp)
{
struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
+ /*
+ * Ensure that we got the type and ID we were looking for.
+ * Everything else was checked by the dquot buffer verifier.
+ */
+ if (!xfs_dquot_check_type(dqp, ddqp)) {
+ xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata corruption detected at %pS, quota %u",
+ __this_address, dqp->q_id);
+ xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+ return -EFSCORRUPTED;
+ }
+
/* copy everything from disk dquot to the incore dquot */
- memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot));
+ dqp->q_type = ddqp->d_type;
+ dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+
+ dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount);
+ dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
+ dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
+
+ dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
+ dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
+ dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
/*
* Reservation counters are defined as reservation plus current usage
* to avoid having to add every time.
*/
- dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
- dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
- dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+ dqp->q_blk.reserved = dqp->q_blk.count;
+ dqp->q_ino.reserved = dqp->q_ino.count;
+ dqp->q_rtb.reserved = dqp->q_rtb.count;
/* initialize the dquot speculative prealloc thresholds */
xfs_dquot_set_prealloc_limits(dqp);
+ return 0;
}
-/* Allocate and initialize the dquot buffer for this in-core dquot. */
-static int
-xfs_qm_dqread_alloc(
- struct xfs_mount *mp,
- struct xfs_dquot *dqp,
- struct xfs_buf **bpp)
+/* Copy the in-core quota fields into the on-disk buffer. */
+void
+xfs_dquot_to_disk(
+ struct xfs_disk_dquot *ddqp,
+ struct xfs_dquot *dqp)
{
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
- XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
- if (error)
- goto err;
-
- error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
- if (error)
- goto err_cancel;
-
- error = xfs_trans_commit(tp);
- if (error) {
- /*
- * Buffer was held to the transaction, so we have to unlock it
- * manually here because we're not passing it back.
- */
- xfs_buf_relse(*bpp);
- *bpp = NULL;
- goto err;
- }
- return 0;
-
-err_cancel:
- xfs_trans_cancel(tp);
-err:
- return error;
+ ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ ddqp->d_version = XFS_DQUOT_VERSION;
+ ddqp->d_type = dqp->q_type;
+ ddqp->d_id = cpu_to_be32(dqp->q_id);
+ ddqp->d_pad0 = 0;
+ ddqp->d_pad = 0;
+
+ ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit);
+ ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit);
+ ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit);
+ ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit);
+ ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit);
+ ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit);
+
+ ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count);
+ ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
+ ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
+
+ ddqp->d_bwarns = 0;
+ ddqp->d_iwarns = 0;
+ ddqp->d_rtbwarns = 0;
+
+ ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
+ ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
+ ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
}
/*
@@ -557,7 +648,7 @@ static int
xfs_qm_dqread(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
bool can_alloc,
struct xfs_dquot **dqpp)
{
@@ -571,7 +662,7 @@ xfs_qm_dqread(
/* Try to read the buffer, allocating if necessary. */
error = xfs_dquot_disk_read(mp, dqp, &bp);
if (error == -ENOENT && can_alloc)
- error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+ error = xfs_dquot_disk_alloc(dqp, &bp);
if (error)
goto err;
@@ -582,9 +673,11 @@ xfs_qm_dqread(
* further.
*/
ASSERT(xfs_buf_islocked(bp));
- xfs_dquot_from_disk(dqp, bp);
-
+ error = xfs_dquot_from_disk(dqp, bp);
xfs_buf_relse(bp);
+ if (error)
+ goto err;
+
*dqpp = dqp;
return error;
@@ -603,7 +696,7 @@ err:
static int
xfs_dq_get_next_id(
struct xfs_mount *mp,
- uint type,
+ xfs_dqtype_t type,
xfs_dqid_t *id)
{
struct xfs_inode *quotip = xfs_quota_inode(mp, type);
@@ -628,11 +721,9 @@ xfs_dq_get_next_id(
start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
lock_flags = xfs_ilock_data_map_shared(quotip);
- if (!(quotip->i_df.if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK);
- if (error)
- return error;
- }
+ error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK);
+ if (error)
+ return error;
if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &cur, &got)) {
/* contiguous chunk, bump startoff for the id calculation */
@@ -671,7 +762,7 @@ restart:
}
xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
+ if (dqp->q_flags & XFS_DQFLAG_FREEING) {
xfs_dqunlock(dqp);
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_freeing(dqp);
@@ -727,21 +818,18 @@ xfs_qm_dqget_cache_insert(
static int
xfs_qm_dqget_checks(
struct xfs_mount *mp,
- uint type)
+ xfs_dqtype_t type)
{
- if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
- return -ESRCH;
-
switch (type) {
- case XFS_DQ_USER:
+ case XFS_DQTYPE_USER:
if (!XFS_IS_UQUOTA_ON(mp))
return -ESRCH;
return 0;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
if (!XFS_IS_GQUOTA_ON(mp))
return -ESRCH;
return 0;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
if (!XFS_IS_PQUOTA_ON(mp))
return -ESRCH;
return 0;
@@ -752,14 +840,14 @@ xfs_qm_dqget_checks(
}
/*
- * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
- * dquot, doing an allocation (if requested) as needed.
+ * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a
+ * locked dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
bool can_alloc,
struct xfs_dquot **O_dqpp)
{
@@ -809,7 +897,7 @@ int
xfs_qm_dqget_uncached(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct xfs_dquot **dqpp)
{
int error;
@@ -825,15 +913,15 @@ xfs_qm_dqget_uncached(
xfs_dqid_t
xfs_qm_id_for_quotatype(
struct xfs_inode *ip,
- uint type)
+ xfs_dqtype_t type)
{
switch (type) {
- case XFS_DQ_USER:
- return ip->i_d.di_uid;
- case XFS_DQ_GROUP:
- return ip->i_d.di_gid;
- case XFS_DQ_PROJ:
- return ip->i_d.di_projid;
+ case XFS_DQTYPE_USER:
+ return i_uid_read(VFS_I(ip));
+ case XFS_DQTYPE_GROUP:
+ return i_gid_read(VFS_I(ip));
+ case XFS_DQTYPE_PROJ:
+ return ip->i_projid;
}
ASSERT(0);
return 0;
@@ -847,7 +935,7 @@ xfs_qm_id_for_quotatype(
int
xfs_qm_dqget_inode(
struct xfs_inode *ip,
- uint type,
+ xfs_dqtype_t type,
bool can_alloc,
struct xfs_dquot **O_dqpp)
{
@@ -933,7 +1021,7 @@ int
xfs_qm_dqget_next(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct xfs_dquot **dqpp)
{
struct xfs_dquot *dqp;
@@ -1013,14 +1101,14 @@ xfs_qm_dqrele(
* from the AIL if it has not been re-logged, and unlocking the dquot's
* flush lock. This behavior is very similar to that of inodes..
*/
-STATIC void
+static void
xfs_qm_dqflush_done(
- struct xfs_buf *bp,
struct xfs_log_item *lip)
{
struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip;
struct xfs_dquot *dqp = qip->qli_dquot;
struct xfs_ail *ailp = lip->li_ailp;
+ xfs_lsn_t tail_lsn;
/*
* We only want to pull the item from the AIL if its
@@ -1034,16 +1122,13 @@ xfs_qm_dqflush_done(
((lip->li_lsn == qip->qli_flush_lsn) ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->ail_lock);
+ xfs_clear_li_failed(lip);
if (lip->li_lsn == qip->qli_flush_lsn) {
- xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+ /* xfs_ail_update_finish() drops the AIL lock */
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
} else {
- /*
- * Clear the failed state since we are about to drop the
- * flush lock
- */
- xfs_clear_li_failed(lip);
spin_unlock(&ailp->ail_lock);
}
}
@@ -1054,6 +1139,68 @@ xfs_qm_dqflush_done(
xfs_dqfunlock(dqp);
}
+void
+xfs_buf_dquot_iodone(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip, *n;
+
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ list_del_init(&lip->li_bio_list);
+ xfs_qm_dqflush_done(lip);
+ }
+}
+
+void
+xfs_buf_dquot_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ spin_lock(&bp->b_mount->m_ail->ail_lock);
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ xfs_set_li_failed(lip, bp);
+ spin_unlock(&bp->b_mount->m_ail->ail_lock);
+}
+
+/* Check incore dquot for errors before we flush. */
+static xfs_failaddr_t
+xfs_qm_dqflush_check(
+ struct xfs_dquot *dqp)
+{
+ xfs_dqtype_t type = xfs_dquot_type(dqp);
+
+ if (type != XFS_DQTYPE_USER &&
+ type != XFS_DQTYPE_GROUP &&
+ type != XFS_DQTYPE_PROJ)
+ return __this_address;
+
+ if (dqp->q_id == 0)
+ return NULL;
+
+ if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit &&
+ !dqp->q_blk.timer)
+ return __this_address;
+
+ if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit &&
+ !dqp->q_ino.timer)
+ return __this_address;
+
+ if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit &&
+ !dqp->q_rtb.timer)
+ return __this_address;
+
+ /* bigtime flag should never be set on root dquots */
+ if (dqp->q_type & XFS_DQTYPE_BIGTIME) {
+ if (!xfs_has_bigtime(dqp->q_mount))
+ return __this_address;
+ if (dqp->q_id == 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
/*
* Write a modified dquot to disk.
* The dquot must be locked and the flush lock too taken by caller.
@@ -1068,9 +1215,9 @@ xfs_qm_dqflush(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
struct xfs_buf *bp;
- struct xfs_dqblk *dqb;
- struct xfs_disk_dquot *ddqp;
+ struct xfs_dqblk *dqblk;
xfs_failaddr_t fa;
int error;
@@ -1084,58 +1231,33 @@ xfs_qm_dqflush(
xfs_qm_dqunpin_wait(dqp);
/*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this dquot
- * to disk, because the log record didn't make it to disk.
- *
- * We also have to remove the log item from the AIL in this case,
- * as we wait for an emptry AIL as part of the unmount process.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
- dqp->dq_flags &= ~XFS_DQ_DIRTY;
-
- xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE);
-
- error = -EIO;
- goto out_unlock;
- }
-
- /*
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp,
- &xfs_dquot_buf_ops);
- if (error)
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
+ if (error == -EAGAIN)
goto out_unlock;
+ if (error)
+ goto out_abort;
- /*
- * Calculate the location of the dquot inside the buffer.
- */
- dqb = bp->b_addr + dqp->q_bufoffset;
- ddqp = &dqb->dd_diskdq;
-
- /*
- * A simple sanity check in case we got a corrupted dquot.
- */
- fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0);
+ fa = xfs_qm_dqflush_check(dqp);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
- be32_to_cpu(ddqp->d_id), fa);
+ dqp->q_id, fa);
xfs_buf_relse(bp);
- xfs_dqfunlock(dqp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return -EFSCORRUPTED;
+ error = -EFSCORRUPTED;
+ goto out_abort;
}
- /* This is the only portion of data that needs to persist */
- memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot));
+ /* Flush the incore dquot to the ondisk buffer. */
+ dqblk = bp->b_addr + dqp->q_bufoffset;
+ xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
/*
* Clear the dirty field and remember the flush lsn for later use.
*/
- dqp->dq_flags &= ~XFS_DQ_DIRTY;
+ dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
&dqp->q_logitem.qli_item.li_lsn);
@@ -1149,18 +1271,18 @@ xfs_qm_dqflush(
* buffer always has a valid CRC. This ensures there is no possibility
* of a dquot without an up-to-date CRC getting to disk.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
- xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ if (xfs_has_crc(mp)) {
+ dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
/*
- * Attach an iodone routine so that we can remove this dquot from the
- * AIL and release the flush lock once the dquot is synced to disk.
+ * Attach the dquot to the buffer so that we can remove this dquot from
+ * the AIL and release the flush lock once the dquot is synced to disk.
*/
- xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
- &dqp->q_logitem.qli_item);
+ bp->b_flags |= _XBF_DQUOTS;
+ list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list);
/*
* If the buffer is pinned then push on the log so we won't
@@ -1175,9 +1297,13 @@ xfs_qm_dqflush(
*bpp = bp;
return 0;
+out_abort:
+ dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
+ xfs_trans_ail_delete(lip, 0);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
out_unlock:
xfs_dqfunlock(dqp);
- return -EIO;
+ return error;
}
/*
@@ -1193,8 +1319,7 @@ xfs_dqlock2(
{
if (d1 && d2) {
ASSERT(d1 != d2);
- if (be32_to_cpu(d1->q_core.d_id) >
- be32_to_cpu(d2->q_core.d_id)) {
+ if (d1->q_id > d2->q_id) {
mutex_lock(&d2->q_qlock);
mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
} else {
@@ -1211,22 +1336,22 @@ xfs_dqlock2(
int __init
xfs_qm_init(void)
{
- xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
+ xfs_dquot_cache = kmem_cache_create("xfs_dquot",
sizeof(struct xfs_dquot),
0, 0, NULL);
- if (!xfs_qm_dqzone)
+ if (!xfs_dquot_cache)
goto out;
- xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
+ xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx",
sizeof(struct xfs_dquot_acct),
0, 0, NULL);
- if (!xfs_qm_dqtrxzone)
- goto out_free_dqzone;
+ if (!xfs_dqtrx_cache)
+ goto out_free_dquot_cache;
return 0;
-out_free_dqzone:
- kmem_cache_destroy(xfs_qm_dqzone);
+out_free_dquot_cache:
+ kmem_cache_destroy(xfs_dquot_cache);
out:
return -ENOMEM;
}
@@ -1234,8 +1359,8 @@ out:
void
xfs_qm_exit(void)
{
- kmem_cache_destroy(xfs_qm_dqtrxzone);
- kmem_cache_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_dqtrx_cache);
+ kmem_cache_destroy(xfs_dquot_cache);
}
/*
@@ -1246,7 +1371,7 @@ xfs_qm_exit(void)
int
xfs_qm_dqiterate(
struct xfs_mount *mp,
- uint dqtype,
+ xfs_dqtype_t type,
xfs_qm_dqiterate_fn iter_fn,
void *priv)
{
@@ -1255,16 +1380,15 @@ xfs_qm_dqiterate(
int error;
do {
- error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+ error = xfs_qm_dqget_next(mp, id, type, &dq);
if (error == -ENOENT)
return 0;
if (error)
return error;
- error = iter_fn(dq, dqtype, priv);
- id = be32_to_cpu(dq->q_core.d_id);
+ error = iter_fn(dq, type, priv);
+ id = dq->q_id;
xfs_qm_dqput(dq);
- id++;
} while (error == 0 && id != 0);
return error;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index fe3e46df604b..80c8f851a2f3 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -27,26 +27,55 @@ enum {
XFS_QLOWSP_MAX
};
+struct xfs_dquot_res {
+ /* Total resources allocated and reserved. */
+ xfs_qcnt_t reserved;
+
+ /* Total resources allocated. */
+ xfs_qcnt_t count;
+
+ /* Absolute and preferred limits. */
+ xfs_qcnt_t hardlimit;
+ xfs_qcnt_t softlimit;
+
+ /*
+ * For root dquots, this is the default grace period, in seconds.
+ * Otherwise, this is when the quota grace period expires,
+ * in seconds since the Unix epoch.
+ */
+ time64_t timer;
+};
+
+static inline bool
+xfs_dquot_res_over_limits(
+ const struct xfs_dquot_res *qres)
+{
+ if ((qres->softlimit && qres->softlimit < qres->reserved) ||
+ (qres->hardlimit && qres->hardlimit < qres->reserved))
+ return true;
+ return false;
+}
+
/*
* The incore dquot structure
*/
struct xfs_dquot {
- uint dq_flags;
struct list_head q_lru;
struct xfs_mount *q_mount;
+ xfs_dqtype_t q_type;
+ uint16_t q_flags;
+ xfs_dqid_t q_id;
uint q_nrefs;
- xfs_daddr_t q_blkno;
int q_bufoffset;
+ xfs_daddr_t q_blkno;
xfs_fileoff_t q_fileoffset;
- struct xfs_disk_dquot q_core;
+ struct xfs_dquot_res q_blk; /* regular blocks */
+ struct xfs_dquot_res q_ino; /* inodes */
+ struct xfs_dquot_res q_rtb; /* realtime blocks */
+
struct xfs_dq_logitem q_logitem;
- /* total regular nblks used+reserved */
- xfs_qcnt_t q_res_bcount;
- /* total inos allocd+reserved */
- xfs_qcnt_t q_res_icount;
- /* total realtime blks used+reserved */
- xfs_qcnt_t q_res_rtbcount;
+
xfs_qcnt_t q_prealloc_lo_wmark;
xfs_qcnt_t q_prealloc_hi_wmark;
int64_t q_low_space[XFS_QLOWSP_MAX];
@@ -101,34 +130,59 @@ static inline void xfs_dqunlock(struct xfs_dquot *dqp)
mutex_unlock(&dqp->q_qlock);
}
-static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+static inline int
+xfs_dquot_type(const struct xfs_dquot *dqp)
{
- switch (type & XFS_DQ_ALLTYPES) {
- case XFS_DQ_USER:
+ return dqp->q_type & XFS_DQTYPE_REC_MASK;
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
return XFS_IS_UQUOTA_ON(mp);
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return XFS_IS_GQUOTA_ON(mp);
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return XFS_IS_PQUOTA_ON(mp);
default:
return 0;
}
}
-static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
+static inline struct xfs_dquot *xfs_inode_dquot(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
{
- switch (type & XFS_DQ_ALLTYPES) {
- case XFS_DQ_USER:
+ switch (type) {
+ case XFS_DQTYPE_USER:
return ip->i_udquot;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return ip->i_gdquot;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return ip->i_pdquot;
default:
return NULL;
}
}
+/* Decide if the dquot's limits are actually being enforced. */
+static inline bool
+xfs_dquot_is_enforced(
+ const struct xfs_dquot *dqp)
+{
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_USER:
+ return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount);
+ case XFS_DQTYPE_GROUP:
+ return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount);
+ case XFS_DQTYPE_PROJ:
+ return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount);
+ }
+ ASSERT(0);
+ return false;
+}
+
/*
* Check whether a dquot is under low free space conditions. We assume the quota
* is enabled and enforced.
@@ -137,38 +191,35 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
{
int64_t freesp;
- freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+ freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved;
if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
return true;
return false;
}
+void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
+
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
-#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
-#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
-#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
+#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY)
void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp);
void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
-void xfs_qm_adjust_dqtimers(struct xfs_mount *mp,
- struct xfs_disk_dquot *d);
-void xfs_qm_adjust_dqlimits(struct xfs_mount *mp,
- struct xfs_dquot *d);
-xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type);
+void xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
+void xfs_qm_adjust_dqlimits(struct xfs_dquot *d);
+xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+ xfs_dqtype_t type);
int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
- uint type, bool can_alloc,
- struct xfs_dquot **dqpp);
-int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
- bool can_alloc,
- struct xfs_dquot **dqpp);
+ xfs_dqtype_t type, bool can_alloc,
+ struct xfs_dquot **dqpp);
+int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type,
+ bool can_alloc, struct xfs_dquot **dqpp);
int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
- uint type, struct xfs_dquot **dqpp);
+ xfs_dqtype_t type, struct xfs_dquot **dqpp);
int xfs_qm_dqget_uncached(struct xfs_mount *mp,
- xfs_dqid_t id, uint type,
- struct xfs_dquot **dqpp);
+ xfs_dqid_t id, xfs_dqtype_t type,
+ struct xfs_dquot **dqpp);
void xfs_qm_dqput(struct xfs_dquot *dqp);
void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
@@ -183,9 +234,12 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
- void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
+ xfs_dqtype_t type, void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
xfs_qm_dqiterate_fn iter_fn, void *priv);
+time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
+time64_t xfs_dquot_set_grace_period(time64_t grace);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index d60647d7197b..6a1aae799cf1 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -45,6 +45,7 @@ xfs_qm_dquot_logitem_format(
struct xfs_log_item *lip,
struct xfs_log_vec *lv)
{
+ struct xfs_disk_dquot ddq;
struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
struct xfs_log_iovec *vecp = NULL;
struct xfs_dq_logformat *qlf;
@@ -52,14 +53,15 @@ xfs_qm_dquot_logitem_format(
qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
qlf->qlf_type = XFS_LI_DQUOT;
qlf->qlf_size = 2;
- qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+ qlf->qlf_id = qlip->qli_dquot->q_id;
qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
qlf->qlf_len = 1;
qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
- xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
- &qlip->qli_dquot->q_core,
+ xfs_dquot_to_disk(&ddq, qlip->qli_dquot);
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq,
sizeof(struct xfs_disk_dquot));
}
@@ -113,23 +115,6 @@ xfs_qm_dqunpin_wait(
wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
}
-/*
- * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
- * have been failed during writeback
- *
- * this informs the AIL that the dquot is already flush locked on the next push,
- * and acquires a hold on the buffer to ensure that it isn't reclaimed before
- * dirty data makes it to disk.
- */
-STATIC void
-xfs_dquot_item_error(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush));
- xfs_set_li_failed(lip, bp);
-}
-
STATIC uint
xfs_qm_dquot_logitem_push(
struct xfs_log_item *lip,
@@ -145,21 +130,6 @@ xfs_qm_dquot_logitem_push(
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
- /*
- * The buffer containing this item failed to be written back
- * previously. Resubmit the buffer for IO
- */
- if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
- if (!xfs_buf_trylock(bp))
- return XFS_ITEM_LOCKED;
-
- if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list))
- rval = XFS_ITEM_FLUSHING;
-
- xfs_buf_unlock(bp);
- return rval;
- }
-
if (!xfs_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
@@ -189,7 +159,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -217,7 +188,7 @@ xfs_qm_dquot_logitem_release(
STATIC void
xfs_qm_dquot_logitem_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
return xfs_qm_dquot_logitem_release(lip);
}
@@ -230,7 +201,6 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_release = xfs_qm_dquot_logitem_release,
.iop_committing = xfs_qm_dquot_logitem_committing,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_error = xfs_dquot_item_error
};
/*
@@ -248,111 +218,3 @@ xfs_qm_dquot_logitem_init(
&xfs_dquot_item_ops);
lp->qli_dquot = dqp;
}
-
-/*------------------ QUOTAOFF LOG ITEMS -------------------*/
-
-static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
-{
- return container_of(lip, struct xfs_qoff_logitem, qql_item);
-}
-
-
-/*
- * This returns the number of iovecs needed to log the given quotaoff item.
- * We only need 1 iovec for an quotaoff item. It just logs the
- * quotaoff_log_format structure.
- */
-STATIC void
-xfs_qm_qoff_logitem_size(
- struct xfs_log_item *lip,
- int *nvecs,
- int *nbytes)
-{
- *nvecs += 1;
- *nbytes += sizeof(struct xfs_qoff_logitem);
-}
-
-STATIC void
-xfs_qm_qoff_logitem_format(
- struct xfs_log_item *lip,
- struct xfs_log_vec *lv)
-{
- struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
- struct xfs_log_iovec *vecp = NULL;
- struct xfs_qoff_logformat *qlf;
-
- qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
- qlf->qf_type = XFS_LI_QUOTAOFF;
- qlf->qf_size = 1;
- qlf->qf_flags = qflip->qql_flags;
- xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
-}
-
-/*
- * There isn't much you can do to push a quotaoff item. It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_qm_qoff_logitem_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_LOCKED;
-}
-
-STATIC xfs_lsn_t
-xfs_qm_qoffend_logitem_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
- struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
- struct xfs_ail *ailp = qfs->qql_item.li_ailp;
-
- /*
- * Delete the qoff-start logitem from the AIL.
- * xfs_trans_ail_delete() drops the AIL lock.
- */
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
-
- kmem_free(qfs->qql_item.li_lv_shadow);
- kmem_free(lip->li_lv_shadow);
- kmem_free(qfs);
- kmem_free(qfe);
- return (xfs_lsn_t)-1;
-}
-
-static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
- .iop_size = xfs_qm_qoff_logitem_size,
- .iop_format = xfs_qm_qoff_logitem_format,
- .iop_committed = xfs_qm_qoffend_logitem_committed,
- .iop_push = xfs_qm_qoff_logitem_push,
-};
-
-static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
- .iop_size = xfs_qm_qoff_logitem_size,
- .iop_format = xfs_qm_qoff_logitem_format,
- .iop_push = xfs_qm_qoff_logitem_push,
-};
-
-/*
- * Allocate and initialize an quotaoff item of the correct quota type(s).
- */
-struct xfs_qoff_logitem *
-xfs_qm_qoff_logitem_init(
- struct xfs_mount *mp,
- struct xfs_qoff_logitem *start,
- uint flags)
-{
- struct xfs_qoff_logitem *qf;
-
- qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
-
- xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
- &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
- qf->qql_item.li_mountp = mp;
- qf->qql_start_lip = start;
- qf->qql_flags = flags;
- return qf;
-}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 3bb19e556ade..794710c24474 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -9,7 +9,6 @@
struct xfs_dquot;
struct xfs_trans;
struct xfs_mount;
-struct xfs_qoff_logitem;
struct xfs_dq_logitem {
struct xfs_log_item qli_item; /* common portion */
@@ -17,21 +16,6 @@ struct xfs_dq_logitem {
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
};
-struct xfs_qoff_logitem {
- struct xfs_log_item qql_item; /* common portion */
- struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
- unsigned int qql_flags;
-};
-
-
void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
-struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
- struct xfs_qoff_logitem *start,
- uint flags);
-struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp,
- struct xfs_qoff_logitem *startqoff,
- uint flags);
-void xfs_trans_log_quotaoff_item(struct xfs_trans *tp,
- struct xfs_qoff_logitem *qlp);
#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
new file mode 100644
index 000000000000..8966ba842395
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_disk_dquot *recddq;
+ struct xfs_dq_logformat *dq_f;
+ uint type;
+
+ if (mp->m_qflags == 0)
+ return;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL)
+ return;
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ return;
+
+ type = recddq->d_type & XFS_DQTYPE_REC_MASK;
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ ASSERT(dq_f->qlf_len == 1);
+
+ xlog_buf_readahead(log, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len),
+ &xfs_dquot_buf_ra_ops);
+}
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_dquot_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ struct xfs_disk_dquot *ddq, *recddq;
+ struct xfs_dq_logformat *dq_f;
+ xfs_failaddr_t fa;
+ int error;
+ uint type;
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (mp->m_qflags == 0)
+ return 0;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL) {
+ xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
+ return -EFSCORRUPTED;
+ }
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
+ xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
+ item->ri_buf[1].i_len, __func__);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * This type of quotas was turned off, so ignore this record.
+ */
+ type = recddq->d_type & XFS_DQTYPE_REC_MASK;
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return 0;
+
+ /*
+ * At this point we know that quota was _not_ turned off.
+ * Since the mount flags are not indicating to us otherwise, this
+ * must mean that quota is on, and the dquot needs to be replayed.
+ * Remember that we may not have fully recovered the superblock yet,
+ * so we can't do the usual trick of looking at the SB quota bits.
+ *
+ * The other possibility, of course, is that the quota subsystem was
+ * removed since the last mount - ENOSYS.
+ */
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
+ if (fa) {
+ xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
+ dq_f->qlf_id, fa);
+ return -EFSCORRUPTED;
+ }
+ ASSERT(dq_f->qlf_len == 1);
+
+ /*
+ * At this point we are assuming that the dquots have been allocated
+ * and hence the buffer has valid dquots stamped in it. It should,
+ * therefore, pass verifier validation. If the dquot is bad, then the
+ * we'll return an error here, so we don't need to specifically check
+ * the dquot in the buffer after the verifier has run.
+ */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ &xfs_dquot_buf_ops);
+ if (error)
+ return error;
+
+ ASSERT(bp);
+ ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+ /*
+ * If the dquot has an LSN in it, recover the dquot only if it's less
+ * than the lsn of the transaction we are replaying.
+ */
+ if (xfs_has_crc(mp)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+ xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ goto out_release;
+ }
+ }
+
+ memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_has_crc(mp)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ ASSERT(dq_f->qlf_size == 2);
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_dquot_item_ops = {
+ .item_type = XFS_LI_DQUOT,
+ .ra_pass2 = xlog_recover_dquot_ra_pass2,
+ .commit_pass2 = xlog_recover_dquot_commit_pass2,
+};
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog
+ * structure, so that we know not to do any dquot item or dquot buffer recovery,
+ * of that type.
+ */
+STATIC int
+xlog_recover_quotaoff_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr;
+ ASSERT(qoff_f);
+
+ /*
+ * The logitem format's flag tells us if this was user quotaoff,
+ * group/project quotaoff or both.
+ */
+ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_USER;
+ if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ;
+ if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP;
+
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_quotaoff_item_ops = {
+ .item_type = XFS_LI_QUOTAOFF,
+ .commit_pass1 = xlog_recover_quotaoff_commit_pass1,
+ /* nothing to commit in pass2 */
+};
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 331765afc53e..c6b2aabd6f18 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -53,6 +53,13 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
XFS_RANDOM_IUNLINK_FALLBACK,
+ XFS_RANDOM_BUF_IOERROR,
+ XFS_RANDOM_REDUCE_MAX_IEXTENTS,
+ XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
+ XFS_RANDOM_AG_RESV_FAIL,
+ XFS_RANDOM_LARP,
+ XFS_RANDOM_DA_LEAF_SPLIT,
+ XFS_RANDOM_ATTR_LEAF_TO_NODE,
};
struct xfs_errortag_attr {
@@ -162,6 +169,13 @@ XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
+XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
+XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
+XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
+XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -199,26 +213,39 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
+ XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
+ XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
+ XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
+ XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+ XFS_ERRORTAG_ATTR_LIST(larp),
+ XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+ XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
NULL,
};
+ATTRIBUTE_GROUPS(xfs_errortag);
static struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_errortag_sysfs_ops,
- .default_attrs = xfs_errortag_attrs,
+ .default_groups = xfs_errortag_groups,
};
int
xfs_errortag_init(
struct xfs_mount *mp)
{
+ int ret;
+
mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
KM_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
- return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
- &mp->m_kobj, "errortag");
+ ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
+ &mp->m_kobj, "errortag");
+ if (ret)
+ kmem_free(mp->m_errortag);
+ return ret;
}
void
@@ -252,7 +279,7 @@ xfs_errortag_test(
ASSERT(error_tag < XFS_ERRTAG_MAX);
randfactor = mp->m_errortag[error_tag];
- if (!randfactor || prandom_u32() % randfactor)
+ if (!randfactor || prandom_u32_max(randfactor))
return false;
xfs_warn_ratelimited(mp,
@@ -290,6 +317,8 @@ xfs_errortag_add(
struct xfs_mount *mp,
unsigned int error_tag)
{
+ BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
+
if (error_tag >= XFS_ERRTAG_MAX)
return -EINVAL;
@@ -345,16 +374,19 @@ xfs_corruption_error(
* Complain about the kinds of metadata corruption that we can't detect from a
* verifier, such as incorrect inter-block relationship data. Does not set
* bp->b_error.
+ *
+ * Call xfs_buf_mark_corrupt, not this function.
*/
void
xfs_buf_corruption_error(
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
{
struct xfs_mount *mp = bp->b_mount;
xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, %s block 0x%llx",
- __return_address, bp->b_ops->name, bp->b_bn);
+ fa, bp->b_ops->name, xfs_buf_daddr(bp));
xfs_alert(mp, "Unmount and run xfs_repair");
@@ -385,7 +417,7 @@ xfs_buf_verifier_error(
xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
"Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- fa, bp->b_ops->name, bp->b_bn, name);
+ fa, bp->b_ops->name, xfs_buf_daddr(bp), name);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 31a5d321ba9a..5191e9145e55 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -15,7 +15,7 @@ extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, const void *buf, size_t bufsize,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
-void xfs_buf_corruption_error(struct xfs_buf *bp);
+void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
@@ -64,15 +64,27 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
* XFS panic tags -- allow a call to xfs_alert_tag() be turned into
* a panic by setting xfs_panic_mask in a sysctl.
*/
-#define XFS_NO_PTAG 0
-#define XFS_PTAG_IFLUSH 0x00000001
-#define XFS_PTAG_LOGRES 0x00000002
-#define XFS_PTAG_AILDELETE 0x00000004
-#define XFS_PTAG_ERROR_REPORT 0x00000008
-#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
-#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
-#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
-#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
-#define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_NO_PTAG 0u
+#define XFS_PTAG_IFLUSH (1u << 0)
+#define XFS_PTAG_LOGRES (1u << 1)
+#define XFS_PTAG_AILDELETE (1u << 2)
+#define XFS_PTAG_ERROR_REPORT (1u << 3)
+#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4)
+#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5)
+#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6)
+#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
+#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
+
+#define XFS_PTAG_STRINGS \
+ { XFS_NO_PTAG, "none" }, \
+ { XFS_PTAG_IFLUSH, "iflush" }, \
+ { XFS_PTAG_LOGRES, "logres" }, \
+ { XFS_PTAG_AILDELETE, "aildelete" }, \
+ { XFS_PTAG_ERROR_REPORT , "error_report" }, \
+ { XFS_PTAG_SHUTDOWN_CORRUPT, "corrupt" }, \
+ { XFS_PTAG_SHUTDOWN_IOERROR, "ioerror" }, \
+ { XFS_PTAG_SHUTDOWN_LOGERROR, "logerror" }, \
+ { XFS_PTAG_FSBLOCK_ZERO, "fsb_zero" }, \
+ { XFS_PTAG_VERIFIER_ERROR, "verifier" }
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f1372f9046e3..1064c2342876 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
@@ -45,6 +44,7 @@ xfs_fs_encode_fh(
int *max_len,
struct inode *parent)
{
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
struct fid *fid = (struct fid *)fh;
struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
int fileid_type;
@@ -57,15 +57,14 @@ xfs_fs_encode_fh(
fileid_type = FILEID_INO32_GEN_PARENT;
/*
- * If the the filesystem may contain 64bit inode numbers, we need
+ * If the filesystem may contain 64bit inode numbers, we need
* to use larger file handles that can represent them.
*
* While we only allocate inodes that do not fit into 32 bits any
* large enough filesystem may contain them, thus the slightly
* confusing looking conditional below.
*/
- if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
- (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+ if (!xfs_has_small_inums(mp) || xfs_is_inode32(mp))
fileid_type |= XFS_FILEID_TYPE_64FLAG;
/*
@@ -85,7 +84,7 @@ xfs_fs_encode_fh(
case FILEID_INO32_GEN_PARENT:
fid->i32.parent_ino = XFS_I(parent)->i_ino;
fid->i32.parent_gen = parent->i_generation;
- /*FALLTHRU*/
+ fallthrough;
case FILEID_INO32_GEN:
fid->i32.ino = XFS_I(inode)->i_ino;
fid->i32.gen = inode->i_generation;
@@ -93,7 +92,7 @@ xfs_fs_encode_fh(
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
fid64->parent_ino = XFS_I(parent)->i_ino;
fid64->parent_gen = parent->i_generation;
- /*FALLTHRU*/
+ fallthrough;
case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
fid64->ino = XFS_I(inode)->i_ino;
fid64->gen = inode->i_generation;
@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 3991e59cfd18..ad22a003f959 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -11,39 +11,37 @@
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
+#include "xfs_ag.h"
void
xfs_extent_busy_insert(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
unsigned int flags)
{
struct xfs_extent_busy *new;
struct xfs_extent_busy *busyp;
- struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent = NULL;
new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
- new->agno = agno;
+ new->agno = pag->pag_agno;
new->bno = bno;
new->length = len;
INIT_LIST_HEAD(&new->list);
new->flags = flags;
/* trace before insert to be able to see failed inserts */
- trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
+ trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
- pag = xfs_perag_get(tp->t_mountp, new->agno);
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
while (*rbp) {
@@ -66,7 +64,6 @@ xfs_extent_busy_insert(
list_add(&new->list, &tp->t_busy);
spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
}
/*
@@ -81,21 +78,17 @@ xfs_extent_busy_insert(
int
xfs_extent_busy_search(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len)
{
- struct xfs_perag *pag;
struct rb_node *rbp;
struct xfs_extent_busy *busyp;
int match = 0;
- pag = xfs_perag_get(mp, agno);
+ /* find closest start bno overlap */
spin_lock(&pag->pagb_lock);
-
rbp = pag->pagb_tree.rb_node;
-
- /* find closest start bno overlap */
while (rbp) {
busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
if (bno < busyp->bno) {
@@ -115,7 +108,6 @@ xfs_extent_busy_search(
}
}
spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
return match;
}
@@ -281,17 +273,14 @@ out_force_log:
void
xfs_extent_busy_reuse(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_agblock_t fbno,
xfs_extlen_t flen,
bool userdata)
{
- struct xfs_perag *pag;
struct rb_node *rbp;
ASSERT(flen > 0);
-
- pag = xfs_perag_get(mp, agno);
spin_lock(&pag->pagb_lock);
restart:
rbp = pag->pagb_tree.rb_node;
@@ -314,7 +303,6 @@ restart:
goto restart;
}
spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
}
/*
@@ -344,7 +332,6 @@ xfs_extent_busy_trim(
ASSERT(*len > 0);
spin_lock(&args->pag->pagb_lock);
-restart:
fbno = *bno;
flen = *len;
rbp = args->pag->pagb_tree.rb_node;
@@ -363,19 +350,6 @@ restart:
continue;
}
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!(args->datatype & XFS_ALLOC_USERDATA) &&
- !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
- if (!xfs_extent_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
if (bbno <= fbno) {
/* start overlap */
@@ -619,12 +593,11 @@ void
xfs_extent_busy_wait_all(
struct xfs_mount *mp)
{
+ struct xfs_perag *pag;
DEFINE_WAIT (wait);
xfs_agnumber_t agno;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- struct xfs_perag *pag = xfs_perag_get(mp, agno);
-
+ for_each_perag(mp, agno, pag) {
do {
prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
if (RB_EMPTY_ROOT(&pag->pagb_tree))
@@ -632,8 +605,6 @@ xfs_extent_busy_wait_all(
schedule();
} while (1);
finish_wait(&pag->pagb_wait, &wait);
-
- xfs_perag_put(pag);
}
}
@@ -643,8 +614,8 @@ xfs_extent_busy_wait_all(
int
xfs_extent_busy_ag_cmp(
void *priv,
- struct list_head *l1,
- struct list_head *l2)
+ const struct list_head *l1,
+ const struct list_head *l2)
{
struct xfs_extent_busy *b1 =
container_of(l1, struct xfs_extent_busy, list);
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 990ab3891971..4a118131059f 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -9,6 +9,7 @@
#define __XFS_EXTENT_BUSY_H__
struct xfs_mount;
+struct xfs_perag;
struct xfs_trans;
struct xfs_alloc_arg;
@@ -31,7 +32,7 @@ struct xfs_extent_busy {
};
void
-xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
@@ -39,11 +40,11 @@ xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
bool do_discard);
int
-xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len);
void
-xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag,
xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
bool
@@ -58,7 +59,8 @@ void
xfs_extent_busy_wait_all(struct xfs_mount *mp);
int
-xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
static inline void xfs_extent_busy_sort(struct list_head *list)
{
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6ea847f6e298..d5130d1fcfae 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -11,6 +11,7 @@
#include "xfs_bit.h"
#include "xfs_shared.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -22,16 +23,20 @@
#include "xfs_bmap.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
-kmem_zone_t *xfs_efi_zone;
-kmem_zone_t *xfs_efd_zone;
+struct kmem_cache *xfs_efi_cache;
+struct kmem_cache *xfs_efd_cache;
+
+static const struct xfs_item_ops xfs_efi_item_ops;
static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_efi_log_item, efi_item);
}
-void
+STATIC void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
@@ -39,7 +44,7 @@ xfs_efi_item_free(
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
- kmem_cache_free(xfs_efi_zone, efip);
+ kmem_cache_free(xfs_efi_cache, efip);
}
/*
@@ -49,28 +54,16 @@ xfs_efi_item_free(
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the EFI.
*/
-void
+STATIC void
xfs_efi_release(
struct xfs_efi_log_item *efip)
{
ASSERT(atomic_read(&efip->efi_refcount) > 0);
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
-}
+ if (!atomic_dec_and_test(&efip->efi_refcount))
+ return;
-/*
- * This returns the number of iovecs needed to log the given efi item.
- * We only need 1 iovec for an efi item. It just logs the efi_log_format
- * structure.
- */
-static inline int
-xfs_efi_item_sizeof(
- struct xfs_efi_log_item *efip)
-{
- return sizeof(struct xfs_efi_log_format) +
- (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+ xfs_trans_ail_delete(&efip->efi_item, 0);
+ xfs_efi_item_free(efip);
}
STATIC void
@@ -79,8 +72,10 @@ xfs_efi_item_size(
int *nvecs,
int *nbytes)
{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
+ *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
/*
@@ -106,7 +101,7 @@ xfs_efi_item_format(
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
&efip->efi_format,
- xfs_efi_item_sizeof(efip));
+ xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents));
}
@@ -139,33 +134,24 @@ xfs_efi_item_release(
xfs_efi_release(EFI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_efi_item_ops = {
- .iop_size = xfs_efi_item_size,
- .iop_format = xfs_efi_item_format,
- .iop_unpin = xfs_efi_item_unpin,
- .iop_release = xfs_efi_item_release,
-};
-
-
/*
* Allocate and initialize an efi item with the given number of extents.
*/
-struct xfs_efi_log_item *
+STATIC struct xfs_efi_log_item *
xfs_efi_init(
struct xfs_mount *mp,
uint nextents)
{
struct xfs_efi_log_item *efip;
- uint size;
ASSERT(nextents > 0);
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(xfs_efi_log_item_t) +
- ((nextents - 1) * sizeof(xfs_extent_t)));
- efip = kmem_zalloc(size, 0);
+ efip = kzalloc(xfs_efi_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
} else {
- efip = kmem_zone_zalloc(xfs_efi_zone, 0);
+ efip = kmem_cache_zalloc(xfs_efi_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
}
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -184,20 +170,22 @@ xfs_efi_init(
* one of which will be the native format for this kernel.
* It will handle the conversion of formats if necessary.
*/
-int
+STATIC int
xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
uint i;
- uint len = sizeof(xfs_efi_log_format_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
- uint len32 = sizeof(xfs_efi_log_format_32_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
- uint len64 = sizeof(xfs_efi_log_format_64_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+ uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
if (buf->i_len == len) {
- memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+ memcpy(dst_efi_fmt, src_efi_fmt,
+ offsetof(struct xfs_efi_log_format, efi_extents));
+ for (i = 0; i < src_efi_fmt->efi_nextents; i++)
+ memcpy(&dst_efi_fmt->efi_extents[i],
+ &src_efi_fmt->efi_extents[i],
+ sizeof(struct xfs_extent));
return 0;
} else if (buf->i_len == len32) {
xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
@@ -228,7 +216,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
+ buf->i_len);
return -EFSCORRUPTED;
}
@@ -244,20 +233,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
- kmem_cache_free(xfs_efd_zone, efdp);
-}
-
-/*
- * This returns the number of iovecs needed to log the given efd item.
- * We only need 1 iovec for an efd item. It just logs the efd_log_format
- * structure.
- */
-static inline int
-xfs_efd_item_sizeof(
- struct xfs_efd_log_item *efdp)
-{
- return sizeof(xfs_efd_log_format_t) +
- (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+ kmem_cache_free(xfs_efd_cache, efdp);
}
STATIC void
@@ -266,8 +242,10 @@ xfs_efd_item_size(
int *nvecs,
int *nbytes)
{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
*nvecs += 1;
- *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
+ *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
/*
@@ -292,7 +270,7 @@ xfs_efd_item_format(
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
&efdp->efd_format,
- xfs_efd_item_sizeof(efdp));
+ xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents));
}
/*
@@ -309,11 +287,20 @@ xfs_efd_item_release(
xfs_efd_item_free(efdp);
}
+static struct xfs_log_item *
+xfs_efd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &EFD_ITEM(lip)->efd_efip->efi_item;
+}
+
static const struct xfs_item_ops xfs_efd_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_release = xfs_efd_item_release,
+ .iop_intent = xfs_efd_item_intent,
};
/*
@@ -332,11 +319,11 @@ xfs_trans_get_efd(
ASSERT(nextents > 0);
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
- (nextents - 1) * sizeof(struct xfs_extent),
- 0);
+ efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
+ GFP_KERNEL | __GFP_NOFAIL);
} else {
- efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
}
xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
@@ -382,7 +369,7 @@ xfs_trans_free_extent(
* 1.) releases the EFI and frees the EFD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
@@ -399,8 +386,8 @@ xfs_trans_free_extent(
static int
xfs_extent_free_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_extent_free_item *ra;
@@ -412,41 +399,16 @@ xfs_extent_free_diff_items(
XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
}
-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_efi_log_item *efip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- efip = xfs_efi_init(tp->t_mountp, count);
- ASSERT(efip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efip->efi_item);
- return efip;
-}
-
/* Log a free extent to the intent item. */
STATIC void
xfs_extent_free_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_efi_log_item *efip,
+ struct xfs_extent_free_item *free)
{
- struct xfs_efi_log_item *efip = intent;
- struct xfs_extent_free_item *free;
uint next_extent;
struct xfs_extent *extp;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
@@ -462,42 +424,69 @@ xfs_extent_free_log_item(
extp->ext_len = free->xefi_blockcount;
}
+static struct xfs_log_item *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
+ struct xfs_extent_free_item *free;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &efip->efi_item);
+ if (sort)
+ list_sort(mp, items, xfs_extent_free_diff_items);
+ list_for_each_entry(free, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, free);
+ return &efip->efi_item;
+}
+
/* Get an EFD so we can process all the free extents. */
-STATIC void *
+static struct xfs_log_item *
xfs_extent_free_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_efd(tp, intent, count);
+ return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
}
/* Process a free extent. */
STATIC int
xfs_extent_free_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *free;
int error;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- error = xfs_trans_free_extent(tp, done_item,
+ oinfo.oi_owner = free->xefi_owner;
+ if (free->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+ error = xfs_trans_free_extent(tp, EFD_ITEM(done),
free->xefi_startblock,
free->xefi_blockcount,
- &free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
+ &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
/* Abort all pending EFIs. */
STATIC void
xfs_extent_free_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_efi_release(intent);
+ xfs_efi_release(EFI_ITEM(intent));
}
/* Cancel a free extent. */
@@ -508,15 +497,13 @@ xfs_extent_free_cancel_item(
struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
.abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_extent_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
@@ -529,12 +516,13 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
STATIC int
xfs_agfl_free_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_efd_log_item *efdp = done_item;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
struct xfs_extent_free_item *free;
struct xfs_extent *extp;
struct xfs_buf *agbp;
@@ -542,18 +530,21 @@ xfs_agfl_free_finish_item(
xfs_agnumber_t agno;
xfs_agblock_t agbno;
uint next_extent;
+ struct xfs_perag *pag;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
ASSERT(free->xefi_blockcount == 1);
agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ oinfo.oi_owner = free->xefi_owner;
trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp,
- &free->xefi_oinfo);
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
+ xfs_perag_put(pag);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -572,39 +563,45 @@ xfs_agfl_free_finish_item(
extp->ext_len = free->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_free(free);
+ kmem_cache_free(xfs_extfree_item_cache, free);
return error;
}
/* sub-type with special handling for AGFL deferred frees */
const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
.abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
};
+/* Is this recovered EFI ok? */
+static inline bool
+xfs_efi_validate_ext(
+ struct xfs_mount *mp,
+ struct xfs_extent *extp)
+{
+ return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
+}
+
/*
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
-int
-xfs_efi_recover(
- struct xfs_mount *mp,
- struct xfs_efi_log_item *efip)
+STATIC int
+xfs_efi_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- struct xfs_efd_log_item *efdp;
- struct xfs_trans *tp;
- int i;
- int error = 0;
- xfs_extent_t *extp;
- xfs_fsblock_t startblock_fsb;
-
- ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_efd_log_item *efdp;
+ struct xfs_trans *tp;
+ struct xfs_extent *extp;
+ int i;
+ int error = 0;
/*
* First check the validity of the extents described by the
@@ -612,19 +609,11 @@ xfs_efi_recover(
* just toss the EFI.
*/
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- extp = &efip->efi_format.efi_extents[i];
- startblock_fsb = XFS_BB_TO_FSB(mp,
- XFS_FSB_TO_DADDR(mp, extp->ext_start));
- if (startblock_fsb == 0 ||
- extp->ext_len == 0 ||
- startblock_fsb >= mp->m_sb.sb_dblocks ||
- extp->ext_len >= mp->m_sb.sb_agblocks) {
- /*
- * This will pull the EFI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- xfs_efi_release(efip);
+ if (!xfs_efi_validate_ext(mp,
+ &efip->efi_format.efi_extents[i])) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &efip->efi_format,
+ sizeof(efip->efi_format));
return -EFSCORRUPTED;
}
}
@@ -639,16 +628,155 @@ xfs_efi_recover(
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
extp->ext_len,
&XFS_RMAP_OINFO_ANY_OWNER, false);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ extp, sizeof(*extp));
if (error)
goto abort_error;
}
- set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_trans_cancel(tp);
return error;
}
+
+STATIC bool
+xfs_efi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_efi_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_extent *extp;
+ unsigned int count;
+
+ count = EFI_ITEM(intent)->efi_format.efi_nextents;
+ extp = EFI_ITEM(intent)->efi_format.efi_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ efdp->efd_next_extent = count;
+ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
+ atomic_set(&efip->efi_next_extent, count);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+ return &efip->efi_item;
+}
+
+static const struct xfs_item_ops xfs_efi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_release = xfs_efi_item_release,
+ .iop_recover = xfs_efi_item_recover,
+ .iop_match = xfs_efi_item_match,
+ .iop_relog = xfs_efi_item_relog,
+};
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_efi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
+ int error;
+
+ efi_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
+ xfs_efi_item_free(efip);
+ return error;
+ }
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn);
+ xfs_efi_release(efip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_efi_item_ops = {
+ .item_type = XFS_LI_EFI,
+ .commit_pass2 = xlog_recover_efi_commit_pass2,
+};
+
+/*
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_efd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_efd_log_format *efd_formatp;
+ int buflen = item->ri_buf[0].i_len;
+
+ efd_formatp = item->ri_buf[0].i_addr;
+
+ if (buflen < sizeof(struct xfs_efd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ efd_formatp->efd_nextents) &&
+ item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ efd_formatp->efd_nextents)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_efd_item_ops = {
+ .item_type = XFS_LI_EFD,
+ .commit_pass2 = xlog_recover_efd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 16aaab06d4ec..da6a5afa607c 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -9,7 +9,7 @@
/* kernel only EFI/EFD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -17,11 +17,6 @@ struct kmem_zone;
#define XFS_EFI_MAX_FAST_EXTENTS 16
/*
- * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_EFI_RECOVERED 1
-
-/*
* This is the "extent free intention" log item. It is used to log the fact
* that some extents need to be free. It is used in conjunction with the
* "extent free done" log item described below.
@@ -50,41 +45,47 @@ struct kmem_zone;
* of commit failure or log I/O errors. Note that the EFD is not inserted in the
* AIL, so at this point both the EFI and EFD are freed.
*/
-typedef struct xfs_efi_log_item {
+struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
- unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
-} xfs_efi_log_item_t;
+};
+
+static inline size_t
+xfs_efi_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efi_log_item, efi_format) +
+ xfs_efi_log_format_sizeof(nr);
+}
/*
* This is the "extent free done" log item. It is used to log
* the fact that some extents earlier mentioned in an efi item
* have been freed.
*/
-typedef struct xfs_efd_log_item {
+struct xfs_efd_log_item {
struct xfs_log_item efd_item;
- xfs_efi_log_item_t *efd_efip;
+ struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
xfs_efd_log_format_t efd_format;
-} xfs_efd_log_item_t;
+};
+
+static inline size_t
+xfs_efd_log_item_sizeof(
+ unsigned int nr)
+{
+ return offsetof(struct xfs_efd_log_item, efd_format) +
+ xfs_efd_log_format_sizeof(nr);
+}
/*
* Max number of extents in fast allocation path.
*/
#define XFS_EFD_MAX_FAST_EXTENTS 16
-extern struct kmem_zone *xfs_efi_zone;
-extern struct kmem_zone *xfs_efd_zone;
-
-xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
-int xfs_efi_copy_format(xfs_log_iovec_t *buf,
- xfs_efi_log_format_t *dst_efi_fmt);
-void xfs_efi_item_free(xfs_efi_log_item_t *);
-void xfs_efi_release(struct xfs_efi_log_item *);
-
-int xfs_efi_recover(struct xfs_mount *mp,
- struct xfs_efi_log_item *efip);
+extern struct kmem_cache *xfs_efi_cache;
+extern struct kmem_cache *xfs_efd_cache;
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b8a4a3f29b36..e462d39c840e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,45 +25,46 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
+#include <linux/mount.h>
static const struct vm_operations_struct xfs_file_vm_ops;
-int
-xfs_update_prealloc_flags(
+/*
+ * Decide if the given file range is aligned to the size of the fundamental
+ * allocation unit for the file.
+ */
+static bool
+xfs_is_falloc_aligned(
struct xfs_inode *ip,
- enum xfs_prealloc_flags flags)
+ loff_t pos,
+ long long int len)
{
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
- 0, 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- if (!(flags & XFS_PREALLOC_INVISIBLE)) {
- VFS_I(ip)->i_mode &= ~S_ISUID;
- if (VFS_I(ip)->i_mode & S_IXGRP)
- VFS_I(ip)->i_mode &= ~S_ISGID;
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ struct xfs_mount *mp = ip->i_mount;
+ uint64_t mask;
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
+ u64 rextbytes;
+ u32 mod;
+
+ rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ div_u64_rem(pos, rextbytes, &mod);
+ if (mod)
+ return false;
+ div_u64_rem(len, rextbytes, &mod);
+ return mod == 0;
+ }
+ mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
+ } else {
+ mask = mp->m_sb.sb_blocksize - 1;
}
- if (flags & XFS_PREALLOC_SET)
- ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
- if (flags & XFS_PREALLOC_CLEAR)
- ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (flags & XFS_PREALLOC_SYNC)
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
+ return !((pos | len) & mask);
}
/*
@@ -80,19 +81,57 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
+ return xfs_log_force_inode(ip);
+}
+
+static xfs_csn_t
+xfs_fsync_seq(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_commit_seq;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_csn_t seq;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ seq = xfs_fsync_seq(ip, datasync);
+ if (seq) {
+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
}
STATIC int
@@ -102,12 +141,10 @@ xfs_file_fsync(
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error = 0;
+ int error, err2;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
@@ -115,7 +152,7 @@ xfs_file_fsync(
if (error)
return error;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_iflags_clear(ip, XFS_ITRUNCATED);
@@ -127,36 +164,22 @@ xfs_file_fsync(
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * All metadata updates are logged, which means that we just have to
- * flush the log up to the latest LSN that touched the inode. If we have
- * concurrent fsync/fdatasync() calls, we need them to all block on the
- * log force before we clear the ili_fsync_fields field. This ensures
- * that we don't get a racing sync operation that does not wait for the
- * metadata to hit the journal before returning. If we race with
- * clearing the ili_fsync_fields, then all that will happen is the log
- * force will do nothing as the lsn will already be on disk. We can't
- * race with setting ili_fsync_fields because that is done under
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- * until after the ili_fsync_fields is cleared.
+ * Any inode that has dirty modifications in the log is pinned. The
+ * racy check here for a pinned inode will not catch modifications
+ * that happen concurrently to the fsync call, but fsync semantics
+ * only require to sync previously completed I/O.
*/
- xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
- if (!datasync ||
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = ip->i_itemp->ili_last_lsn;
+ err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (err2 && !error)
+ error = err2;
}
- if (lsn) {
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- ip->i_itemp->ili_fsync_fields = 0;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
/*
* If we only have a single device, and the log force about was
* a no-op we might have to flush the data device cache here.
@@ -165,36 +188,51 @@ xfs_file_fsync(
* commit.
*/
if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
- mp->m_logdev_targp == mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ mp->m_logdev_targp == mp->m_ddev_targp) {
+ err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ if (err2 && !error)
+ error = err2;
+ }
return error;
}
+static int
+xfs_ilock_iocb(
+ struct kiocb *iocb,
+ unsigned int lock_mode)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
+
+ return 0;
+}
+
STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- size_t count = iov_iter_count(to);
ssize_t ret;
- trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_direct_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
file_accessed(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -206,21 +244,16 @@ xfs_file_dax_read(
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- size_t count = iov_iter_count(to);
ssize_t ret = 0;
- trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_dax_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -229,21 +262,18 @@ xfs_file_dax_read(
}
STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
- trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ trace_xfs_file_buffered_read(iocb, to);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -261,15 +291,15 @@ xfs_file_read_iter(
XFS_STATS_INC(mp, xs_read_calls);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
else if (iocb->ki_flags & IOCB_DIRECT)
- ret = xfs_file_dio_aio_read(iocb, to);
+ ret = xfs_file_dio_read(iocb, to);
else
- ret = xfs_file_buffered_aio_read(iocb, to);
+ ret = xfs_file_buffered_read(iocb, to);
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@ -284,10 +314,10 @@ xfs_file_read_iter(
* if called for a direct write beyond i_size.
*/
STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- int *iolock)
+ unsigned int *iolock)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -302,7 +332,14 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
if (error)
return error;
@@ -313,28 +350,45 @@ restart:
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
goto restart;
}
+
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the
- * iolock shared, we need to update it to exclusive which implies
- * having to redo all checks before.
+ * write. If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies having to
+ * redo all checks before.
+ *
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while we
+ * do this check until we have placed an IO barrier (i.e. hold the
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have the
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
*
- * We need to serialise against EOF updates that occur in IO
- * completions here. We want to make sure that nobody is changing the
- * size while we do this check until we have placed an IO barrier (i.e.
- * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
- * The spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
- * and hence be able to correctly determine if we need to run zeroing.
+ * We can do an unlocked check here safely as IO completion can only
+ * extend EOF. Truncate is locked out at this point, so the EOF can
+ * not move backwards, only forwards. Hence we only need to take the
+ * slow path and spin locks when we are at or beyond the current EOF.
*/
+ if (iocb->ki_pos <= i_size_read(inode))
+ goto out;
+
spin_lock(&ip->i_flags_lock);
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
@@ -354,22 +408,16 @@ restart:
drained_dio = true;
goto restart;
}
-
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
- NULL, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
} else
spin_unlock(&ip->i_flags_lock);
- /*
- * Updating the timestamps will grab the ilock again from
- * xfs_fs_dirty_inode, so we have to call it after dropping the
- * lock above. Eventually we should look into a way to avoid
- * the pointless lock roundtrip.
- */
- return file_modified(file);
+out:
+ return kiocb_modified(iocb);
}
static int
@@ -386,7 +434,7 @@ xfs_dio_write_end_io(
trace_xfs_end_io_direct_write(ip, offset, size);
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
if (error)
@@ -434,7 +482,17 @@ xfs_dio_write_end_io(
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
+ *
+ * As IO completion only ever extends EOF, we can do an unlocked check
+ * here to avoid taking the spinlock. If we land within the current EOF,
+ * then we do not need to do an extending update at all, and we don't
+ * need to take the lock to check this. If we race with an update moving
+ * EOF, then we'll either still be beyond EOF and need to take the lock,
+ * or we'll be within EOF and we don't need to take it at all.
*/
+ if (offset + size <= i_size_read(inode))
+ goto out;
+
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode)) {
i_size_write(inode, offset + size);
@@ -454,122 +512,149 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
};
/*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Handle block aligned direct I/O writes
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret;
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
- return -EINVAL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
/*
- * Don't take the exclusive iolock here unless the I/O is unaligned to
- * the file system block size. We don't need to consider the EOF
- * extension case here because xfs_file_aio_write_checks() will relock
- * the inode as necessary for EOF zeroing cases and fill out the new
- * inode size as appropriate.
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
*/
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask)) {
- unaligned_io = 1;
-
- /*
- * We can't properly handle unaligned direct I/O to reflink
- * files yet, as we can't unshare a partial block.
- */
- if (xfs_is_cow_inode(ip)) {
- trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- return -EREMCHG;
- }
- iolock = XFS_IOLOCK_EXCL;
- } else {
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, 0, NULL, 0);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* unaligned dio always waits, bail */
- if (unaligned_io)
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, iolock))
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
+
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
+retry_exclusive:
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
- goto out;
- count = iov_iter_count(from);
+ return ret;
/*
- * If we are doing unaligned IO, we can't allow any other overlapping IO
- * in-flight at the same time or we risk data corruption. Wait for all
- * other IO to drain before we submit. If the IO is aligned, demote the
- * iolock if we had to take the exclusive lock in
- * xfs_file_aio_write_checks() for other reasons.
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
*/
- if (unaligned_io) {
- inode_dio_wait(inode);
- } else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
/*
- * If unaligned, this is the only IO in-flight. Wait on it before we
- * release the iolock to prevent subsequent overlapping IO.
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
*/
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io);
-out:
- xfs_iunlock(ip, iolock);
+ &xfs_dio_write_ops, flags, NULL, 0);
/*
- * No fallback to buffered IO on errors for XFS, direct IO will either
- * complete fully or fail.
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
*/
- ASSERT(ret < 0 || ret == count);
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
return ret;
}
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
static noinline ssize_t
xfs_file_dax_write(
struct kiocb *iocb,
@@ -577,33 +662,28 @@ xfs_file_dax_write(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
- size_t count;
loff_t pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
- }
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
pos = iocb->ki_pos;
- count = iov_iter_count(from);
- trace_xfs_file_dax_write(ip, count, pos);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+ trace_xfs_file_dax_write(iocb, from);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
out:
- xfs_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
if (error)
return error;
@@ -617,33 +697,30 @@ out:
}
STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
- int iolock;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
+ bool cleared_space = false;
+ unsigned int iolock;
write_retry:
iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
@@ -656,27 +733,23 @@ write_retry:
* metadata space. This reduces the chances that the eofblocks scan
* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
* also behaves as a filter to prevent too many eofblocks scans from
- * running at the same time.
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
*/
- if (ret == -EDQUOT && !enospc) {
+ if (ret == -EDQUOT && !cleared_space) {
xfs_iunlock(ip, iolock);
- enospc = xfs_inode_free_quota_eofblocks(ip);
- if (enospc)
- goto write_retry;
- enospc = xfs_inode_free_quota_cowblocks(ip);
- if (enospc)
- goto write_retry;
- iolock = 0;
- } else if (ret == -ENOSPC && !enospc) {
- struct xfs_eofblocks eofb = {0};
-
- enospc = 1;
+ xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
+ struct xfs_icwalk icw = {0};
+
+ cleared_space = true;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
- eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
- xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
+ xfs_blockgc_free_space(ip->i_mount, &icw);
goto write_retry;
}
@@ -698,9 +771,7 @@ xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
@@ -710,7 +781,7 @@ xfs_file_write_iter(
if (ocount == 0)
return 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (xfs_is_shutdown(ip->i_mount))
return -EIO;
if (IS_DAX(inode))
@@ -723,12 +794,12 @@ xfs_file_write_iter(
* CoW. In all other directio scenarios we do not
* allow an operation to fall back to buffered mode.
*/
- ret = xfs_file_dio_aio_write(iocb, from);
- if (ret != -EREMCHG)
+ ret = xfs_file_dio_write(iocb, from);
+ if (ret != -ENOTBLK)
return ret;
}
- return xfs_file_buffered_aio_write(iocb, from);
+ return xfs_file_buffered_write(iocb, from);
}
static void
@@ -742,7 +813,7 @@ xfs_wait_dax_page(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
-static int
+int
xfs_break_dax_layouts(
struct inode *inode,
bool *retry)
@@ -779,7 +850,7 @@ xfs_break_layouts(
error = xfs_break_dax_layouts(inode, &retry);
if (error || retry)
break;
- /* fall through */
+ fallthrough;
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
@@ -792,6 +863,21 @@ xfs_break_layouts(
return error;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -807,7 +893,6 @@ xfs_file_fallocate(
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
- enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;
@@ -852,14 +937,16 @@ xfs_file_fallocate(
goto out_unlock;
}
+ error = file_modified(file);
+ if (error)
+ goto out_unlock;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
-
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -879,10 +966,9 @@ xfs_file_fallocate(
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_INSERT_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
loff_t isize = i_size_read(inode);
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -904,8 +990,6 @@ xfs_file_fallocate(
}
do_file_insert = true;
} else {
- flags |= XFS_PREALLOC_SET;
-
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
@@ -951,27 +1035,20 @@ xfs_file_fallocate(
}
if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len,
- XFS_BMAPI_PREALLOC);
+ error = xfs_alloc_file_space(ip, offset, len);
if (error)
goto out_unlock;
}
}
- if (file->f_flags & O_DSYNC)
- flags |= XFS_PREALLOC_SYNC;
-
- error = xfs_update_prealloc_flags(ip, flags);
- if (error)
- goto out_unlock;
-
/* Change file size if needed */
if (new_size) {
struct iattr iattr;
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_dentry(file), &iattr);
+ error = xfs_vn_setattr_size(file_mnt_user_ns(file),
+ file_dentry(file), &iattr);
if (error)
goto out_unlock;
}
@@ -982,8 +1059,14 @@ xfs_file_fallocate(
* leave shifted extents past EOF and hence losing access to
* the data that is contained within them.
*/
- if (do_file_insert)
+ if (do_file_insert) {
error = xfs_insert_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_file_sync_writes(file))
+ error = xfs_log_force_inode(ip);
out_unlock:
xfs_iunlock(ip, iolock);
@@ -1036,16 +1119,16 @@ xfs_file_remap_range(
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return -EOPNOTSUPP;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
- if (ret < 0 || len == 0)
+ if (ret || len == 0)
return ret;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
@@ -1062,16 +1145,20 @@ xfs_file_remap_range(
*/
cowextsize = 0;
if (pos_in == 0 && len == i_size_read(inode_in) &&
- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
pos_out == 0 && len >= i_size_read(inode_out) &&
- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
- cowextsize = src->i_d.di_cowextsize;
+ !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_cowextsize;
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+ xfs_log_force_inode(dest);
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1082,12 +1169,10 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
- if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT;
- return 0;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+ return generic_file_open(inode, file);
}
STATIC int
@@ -1096,7 +1181,7 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
error = xfs_file_open(inode, file);
@@ -1108,7 +1193,7 @@ xfs_dir_open(
* certain to have the next operation be a read there.
*/
mode = xfs_ilock_data_map_shared(ip);
- if (ip->i_d.di_nextents > 0)
+ if (ip->i_df.if_nextents > 0)
error = xfs_dir3_data_readahead(ip, 0, 0);
xfs_iunlock(ip, mode);
return error;
@@ -1143,7 +1228,7 @@ xfs_file_readdir(
* point we can change the ->readdir prototype to include the
* buffer size. For now we use the current glibc buffer size.
*/
- bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
return xfs_readdir(NULL, ip, ctx, bufsize);
}
@@ -1156,7 +1241,7 @@ xfs_file_llseek(
{
struct inode *inode = file->f_mapping->host;
- if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
+ if (xfs_is_shutdown(XFS_I(inode)->i_mount))
return -EIO;
switch (whence) {
@@ -1175,13 +1260,39 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
+#ifdef CONFIG_FS_DAX
+static inline vm_fault_t
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#else
+static inline vm_fault_t
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+}
+#endif
+
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
*
- * mmap_sem (MM)
+ * mmap_lock (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmaplock (XFS - truncate serialisation)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1202,30 +1313,38 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
pfn_t pfn;
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
- (write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
- &xfs_read_iomap_ops);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
- if (write_fault)
+ if (write_fault) {
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
- else
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
ret = filemap_fault(vmf);
+ }
}
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
}
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
+
static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
@@ -1233,7 +1352,7 @@ xfs_filemap_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
IS_DAX(file_inode(vmf->vma->vm_file)) &&
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
@@ -1246,7 +1365,7 @@ xfs_filemap_huge_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, pe_size,
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
@@ -1269,10 +1388,25 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
+static vm_fault_t
+xfs_filemap_map_pages(
+ struct vm_fault *vmf,
+ pgoff_t start_pgoff,
+ pgoff_t end_pgoff)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = filemap_map_pages,
+ .map_pages = xfs_filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
@@ -1305,7 +1439,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
- .iopoll = iomap_dio_iopoll,
+ .iopoll = iocb_bio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 1a88025e68a3..34b21a29c39b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -9,13 +9,13 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
#include "xfs_trace.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_trans.h"
#include "xfs_filestream.h"
@@ -33,39 +33,7 @@ enum xfs_fstrm_alloc {
/*
* Allocation group filestream associations are tracked with per-ag atomic
* counters. These counters allow xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation. When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all. The race condition this addresses is the following:
- *
- * - The work queue scheduler fires and pulls a filestream directory cache
- * element off the LRU end of the cache for deletion, then gets pre-empted.
- * - A growfs operation grabs the m_peraglock in write mode, flushes all the
- * remaining items from the cache and reallocates the mount point's per-ag
- * array, resetting all the counters to zero.
- * - The work queue thread resumes and calls the free function for the element
- * it started cleaning up earlier. In the process it decrements the
- * filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
+ * particular AG already has active filestreams associated with it.
*/
int
xfs_filestream_peek_ag(
@@ -158,13 +126,14 @@ xfs_filestream_pick_ag(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
- err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+ err = xfs_alloc_read_agf(pag, NULL, trylock, NULL);
if (err) {
- xfs_perag_put(pag);
- if (err != -EAGAIN)
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
return err;
+ }
/* Couldn't lock the AGF, skip this AG. */
- continue;
+ goto next_ag;
}
}
@@ -212,7 +181,7 @@ next_ag:
if (ag != startag)
continue;
- /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+ /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */
if (trylock != 0) {
trylock = 0;
continue;
@@ -327,7 +296,7 @@ xfs_filestream_lookup_ag(
* Set the starting AG using the rotor for inode32, otherwise
* use the directory inode's AG.
*/
- if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ if (xfs_is_inode32(mp)) {
xfs_agnumber_t rotorstep = xfs_rotorstep;
startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
mp->m_agfrotor = (mp->m_agfrotor + 1) %
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 5cc7665e93c9..403226ebb80b 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -21,8 +21,8 @@ static inline int
xfs_inode_is_filestream(
struct xfs_inode *ip)
{
- return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
- (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
+ return xfs_has_filestreams(ip->i_mount) ||
+ (ip->i_diflags & XFS_DIFLAG_FILESTREAM);
}
#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 918456ca29e1..d8337274c74d 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -24,9 +24,10 @@
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_rtalloc.h"
+#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
-void
+static void
xfs_fsmap_from_internal(
struct fsmap *dest,
struct xfs_fsmap *src)
@@ -60,7 +61,7 @@ xfs_fsmap_to_internal(
static int
xfs_fsmap_owner_to_rmap(
struct xfs_rmap_irec *dest,
- struct xfs_fsmap *src)
+ const struct xfs_fsmap *src)
{
if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
dest->rm_owner = src->fmr_owner;
@@ -110,8 +111,8 @@ xfs_fsmap_owner_to_rmap(
/* Convert an rmapbt owner into an fsmap owner. */
static int
xfs_fsmap_owner_from_rmap(
- struct xfs_fsmap *dest,
- struct xfs_rmap_irec *src)
+ struct xfs_fsmap *dest,
+ const struct xfs_rmap_irec *src)
{
dest->fmr_flags = 0;
if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
@@ -155,13 +156,12 @@ xfs_fsmap_owner_from_rmap(
/* getfsmap query state */
struct xfs_getfsmap_info {
struct xfs_fsmap_head *head;
- xfs_fsmap_format_t formatter; /* formatting fn */
- void *format_arg; /* format buffer */
+ struct fsmap *fsmap_recs; /* mapping records */
struct xfs_buf *agf_bp; /* AGF, for refcount queries */
+ struct xfs_perag *pag; /* AG info, if applicable */
xfs_daddr_t next_daddr; /* next daddr we expect */
u64 missing_owner; /* owner of holes */
u32 dev; /* device id */
- xfs_agnumber_t agno; /* AG number, if applicable */
struct xfs_rmap_irec low; /* low rmap key */
struct xfs_rmap_irec high; /* high rmap key */
bool last; /* last extent? */
@@ -171,7 +171,7 @@ struct xfs_getfsmap_info {
struct xfs_getfsmap_dev {
u32 dev;
int (*fn)(struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info);
};
@@ -192,7 +192,7 @@ STATIC int
xfs_getfsmap_is_shared(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
bool *stat)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -202,16 +202,15 @@ xfs_getfsmap_is_shared(
int error;
*stat = false;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
- /* rt files will have agno set to NULLAGNUMBER */
- if (info->agno == NULLAGNUMBER)
+ /* rt files will have no perag structure */
+ if (!info->pag)
return 0;
/* Are there any shared blocks here? */
flen = 0;
- cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
- info->agno);
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag);
error = xfs_refcount_find_shared(cur, rec->rm_startblock,
rec->rm_blockcount, &fbno, &flen, false);
@@ -224,6 +223,20 @@ xfs_getfsmap_is_shared(
return 0;
}
+static inline void
+xfs_getfsmap_format(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *xfm,
+ struct xfs_getfsmap_info *info)
+{
+ struct fsmap *rec;
+
+ trace_xfs_getfsmap_mapping(mp, xfm);
+
+ rec = &info->fsmap_recs[info->head->fmh_entries++];
+ xfs_fsmap_from_internal(rec, xfm);
+}
+
/*
* Format a reverse mapping for getfsmap, having translated rm_startblock
* into the appropriate daddr units.
@@ -232,7 +245,7 @@ STATIC int
xfs_getfsmap_helper(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
xfs_daddr_t rec_daddr)
{
struct xfs_fsmap fmr;
@@ -256,6 +269,9 @@ xfs_getfsmap_helper(
/* Are we just counting mappings? */
if (info->head->fmh_count == 0) {
+ if (info->head->fmh_entries == UINT_MAX)
+ return -ECANCELED;
+
if (rec_daddr > info->next_daddr)
info->head->fmh_entries++;
@@ -285,10 +301,7 @@ xfs_getfsmap_helper(
fmr.fmr_offset = 0;
fmr.fmr_length = rec_daddr - info->next_daddr;
fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
- error = info->formatter(&fmr, info->format_arg);
- if (error)
- return error;
- info->head->fmh_entries++;
+ xfs_getfsmap_format(mp, &fmr, info);
}
if (info->last)
@@ -298,7 +311,8 @@ xfs_getfsmap_helper(
if (info->head->fmh_entries >= info->head->fmh_count)
return -ECANCELED;
- trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
+ trace_xfs_fsmap_mapping(mp, info->dev,
+ info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec);
fmr.fmr_device = info->dev;
fmr.fmr_physical = rec_daddr;
@@ -320,11 +334,8 @@ xfs_getfsmap_helper(
if (shared)
fmr.fmr_flags |= FMR_OF_SHARED;
}
- error = info->formatter(&fmr, info->format_arg);
- if (error)
- return error;
- info->head->fmh_entries++;
+ xfs_getfsmap_format(mp, &fmr, info);
out:
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
@@ -336,7 +347,7 @@ out:
STATIC int
xfs_getfsmap_datadev_helper(
struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *rec,
+ const struct xfs_rmap_irec *rec,
void *priv)
{
struct xfs_mount *mp = cur->bc_mp;
@@ -344,7 +355,7 @@ xfs_getfsmap_datadev_helper(
xfs_fsblock_t fsb;
xfs_daddr_t rec_daddr;
- fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock);
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock);
rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
@@ -354,7 +365,7 @@ xfs_getfsmap_datadev_helper(
STATIC int
xfs_getfsmap_datadev_bnobt_helper(
struct xfs_btree_cur *cur,
- struct xfs_alloc_rec_incore *rec,
+ const struct xfs_alloc_rec_incore *rec,
void *priv)
{
struct xfs_mount *mp = cur->bc_mp;
@@ -362,7 +373,7 @@ xfs_getfsmap_datadev_bnobt_helper(
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
- rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno,
rec->ar_startblock);
irec.rm_startblock = rec->ar_startblock;
@@ -378,7 +389,7 @@ xfs_getfsmap_datadev_bnobt_helper(
static void
xfs_getfsmap_set_irec_flags(
struct xfs_rmap_irec *irec,
- struct xfs_fsmap *fmr)
+ const struct xfs_fsmap *fmr)
{
irec->rm_flags = 0;
if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
@@ -393,7 +404,7 @@ xfs_getfsmap_set_irec_flags(
STATIC int
xfs_getfsmap_logdev(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -419,8 +430,8 @@ xfs_getfsmap_logdev(
info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
info->missing_owner = XFS_FMR_OWN_FREE;
- trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+ trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
if (keys[0].fmr_physical > 0)
return 0;
@@ -439,11 +450,11 @@ xfs_getfsmap_logdev(
/* Transform a rtbitmap "record" into a fsmap */
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
+ const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_mount *mp = tp->t_mountp;
struct xfs_getfsmap_info *info = priv;
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
@@ -462,7 +473,7 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
STATIC int
__xfs_getfsmap_rtdev(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
int (*query_fn)(struct xfs_trans *,
struct xfs_getfsmap_info *),
struct xfs_getfsmap_info *info)
@@ -470,16 +481,14 @@ __xfs_getfsmap_rtdev(
struct xfs_mount *mp = tp->t_mountp;
xfs_fsblock_t start_fsb;
xfs_fsblock_t end_fsb;
- xfs_daddr_t eofs;
+ uint64_t eofs;
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- if (keys[1].fmr_physical >= eofs)
- keys[1].fmr_physical = eofs - 1;
start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
- end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical);
+ end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
/* Set up search keys */
info->low.rm_startblock = start_fsb;
@@ -498,8 +507,8 @@ __xfs_getfsmap_rtdev(
info->high.rm_blockcount = 0;
xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
- trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+ trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high);
return query_fn(tp, info);
}
@@ -512,27 +521,37 @@ xfs_getfsmap_rtdev_rtbitmap_query(
{
struct xfs_rtalloc_rec alow = { 0 };
struct xfs_rtalloc_rec ahigh = { 0 };
+ struct xfs_mount *mp = tp->t_mountp;
int error;
- xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ /*
+ * Set up query parameters to return free rtextents covering the range
+ * we want.
+ */
alow.ar_startext = info->low.rm_startblock;
ahigh.ar_startext = info->high.rm_startblock;
- do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize);
- if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize))
+ do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
+ if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
- error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
goto err;
- /* Report any gaps at the end of the rtbitmap */
+ /*
+ * Report any gaps at the end of the rtbitmap by simulating a null
+ * rmap starting at the block after the end of the query range.
+ */
info->last = true;
- error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
+
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
if (error)
goto err;
err:
- xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
return error;
}
@@ -540,7 +559,7 @@ err:
STATIC int
xfs_getfsmap_rtdev_rtbitmap(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
info->missing_owner = XFS_FMR_OWN_UNKNOWN;
@@ -553,7 +572,7 @@ xfs_getfsmap_rtdev_rtbitmap(
STATIC int
__xfs_getfsmap_datadev(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info,
int (*query_fn)(struct xfs_trans *,
struct xfs_getfsmap_info *,
@@ -562,21 +581,20 @@ __xfs_getfsmap_datadev(
void *priv)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
struct xfs_btree_cur *bt_cur = NULL;
xfs_fsblock_t start_fsb;
xfs_fsblock_t end_fsb;
xfs_agnumber_t start_ag;
xfs_agnumber_t end_ag;
- xfs_daddr_t eofs;
+ uint64_t eofs;
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- if (keys[1].fmr_physical >= eofs)
- keys[1].fmr_physical = eofs - 1;
start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
- end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical);
+ end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
/*
* Convert the fsmap low/high keys to AG based keys. Initialize
@@ -600,20 +618,20 @@ __xfs_getfsmap_datadev(
start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
- /* Query each AG */
- for (info->agno = start_ag; info->agno <= end_ag; info->agno++) {
+ for_each_perag_range(mp, start_ag, end_ag, pag) {
/*
* Set the AG high key from the fsmap high key if this
* is the last AG that we're querying.
*/
- if (info->agno == end_ag) {
+ info->pag = pag;
+ if (pag->pag_agno == end_ag) {
info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
end_fsb);
info->high.rm_offset = XFS_BB_TO_FSBT(mp,
keys[1].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
if (error)
- goto err;
+ break;
xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
}
@@ -624,38 +642,44 @@ __xfs_getfsmap_datadev(
info->agf_bp = NULL;
}
- error = xfs_alloc_read_agf(mp, tp, info->agno, 0,
- &info->agf_bp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp);
if (error)
- goto err;
+ break;
- trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, info->agno,
+ trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno,
+ &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno,
&info->high);
error = query_fn(tp, info, &bt_cur, priv);
if (error)
- goto err;
+ break;
/*
* Set the AG low key to the start of the AG prior to
* moving on to the next AG.
*/
- if (info->agno == start_ag) {
+ if (pag->pag_agno == start_ag) {
info->low.rm_startblock = 0;
info->low.rm_owner = 0;
info->low.rm_offset = 0;
info->low.rm_flags = 0;
}
- }
- /* Report any gap at the end of the AG */
- info->last = true;
- error = query_fn(tp, info, &bt_cur, priv);
- if (error)
- goto err;
+ /*
+ * If this is the last AG, report any gap at the end of it
+ * before we drop the reference to the perag when the loop
+ * terminates.
+ */
+ if (pag->pag_agno == end_ag) {
+ info->last = true;
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ break;
+ }
+ info->pag = NULL;
+ }
-err:
if (bt_cur)
xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
XFS_BTREE_NOERROR);
@@ -663,6 +687,13 @@ err:
xfs_trans_brelse(tp, info->agf_bp);
info->agf_bp = NULL;
}
+ if (info->pag) {
+ xfs_perag_put(info->pag);
+ info->pag = NULL;
+ } else if (pag) {
+ /* loop termination case */
+ xfs_perag_put(pag);
+ }
return error;
}
@@ -681,7 +712,7 @@ xfs_getfsmap_datadev_rmapbt_query(
/* Allocate cursor for this AG and query_range it. */
*curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->agno);
+ info->pag);
return xfs_rmap_query_range(*curpp, &info->low, &info->high,
xfs_getfsmap_datadev_helper, info);
}
@@ -690,7 +721,7 @@ xfs_getfsmap_datadev_rmapbt_query(
STATIC int
xfs_getfsmap_datadev_rmapbt(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
info->missing_owner = XFS_FMR_OWN_FREE;
@@ -714,7 +745,7 @@ xfs_getfsmap_datadev_bnobt_query(
/* Allocate cursor for this AG and query_range it. */
*curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->agno, XFS_BTNUM_BNO);
+ info->pag, XFS_BTNUM_BNO);
key->ar_startblock = info->low.rm_startblock;
key[1].ar_startblock = info->high.rm_startblock;
return xfs_alloc_query_range(*curpp, key, &key[1],
@@ -725,7 +756,7 @@ xfs_getfsmap_datadev_bnobt_query(
STATIC int
xfs_getfsmap_datadev_bnobt(
struct xfs_trans *tp,
- struct xfs_fsmap *keys,
+ const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
struct xfs_alloc_rec_incore akeys[2];
@@ -792,11 +823,11 @@ xfs_getfsmap_check_keys(
#endif /* CONFIG_XFS_RT */
/*
- * Get filesystem's extents as described in head, and format for
- * output. Calls formatter to fill the user's buffer until all
- * extents are mapped, until the passed-in head->fmh_count slots have
- * been filled, or until the formatter short-circuits the loop, if it
- * is tracking filled-in extents on its own.
+ * Get filesystem's extents as described in head, and format for output. Fills
+ * in the supplied records array until there are no more reverse mappings to
+ * return or head.fmh_entries == head.fmh_count. In the second case, this
+ * function returns -ECANCELED to indicate that more records would have been
+ * returned.
*
* Key to Confusion
* ----------------
@@ -816,8 +847,7 @@ int
xfs_getfsmap(
struct xfs_mount *mp,
struct xfs_fsmap_head *head,
- xfs_fsmap_format_t formatter,
- void *arg)
+ struct fsmap *fsmap_recs)
{
struct xfs_trans *tp = NULL;
struct xfs_fsmap dkeys[2]; /* per-dev keys */
@@ -833,8 +863,8 @@ xfs_getfsmap(
!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
return -EINVAL;
- use_rmap = capable(CAP_SYS_ADMIN) &&
- xfs_sb_version_hasrmapbt(&mp->m_sb);
+ use_rmap = xfs_has_rmapbt(mp) &&
+ has_capability_noaudit(current, CAP_SYS_ADMIN);
head->fmh_entries = 0;
/* Set up our device handlers. */
@@ -892,8 +922,7 @@ xfs_getfsmap(
info.next_daddr = head->fmh_keys[0].fmr_physical +
head->fmh_keys[0].fmr_length;
- info.formatter = formatter;
- info.format_arg = arg;
+ info.fsmap_recs = fsmap_recs;
info.head = head;
/* For each device we support... */
@@ -918,13 +947,18 @@ xfs_getfsmap(
if (handlers[i].dev > head->fmh_keys[0].fmr_device)
memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
+ /*
+ * Grab an empty transaction so that we can use its recursive
+ * buffer locking abilities to detect cycles in the rmapbt
+ * without deadlocking.
+ */
error = xfs_trans_alloc_empty(mp, &tp);
if (error)
break;
info.dev = handlers[i].dev;
info.last = false;
- info.agno = NULLAGNUMBER;
+ info.pag = NULL;
error = handlers[i].fn(tp, dkeys, &info);
if (error)
break;
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
index c6c57739b862..a0775788e7b1 100644
--- a/fs/xfs/xfs_fsmap.h
+++ b/fs/xfs/xfs_fsmap.h
@@ -27,13 +27,9 @@ struct xfs_fsmap_head {
struct xfs_fsmap fmh_keys[2]; /* low and high keys */
};
-void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
-/* fsmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
-
int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
- xfs_fsmap_format_t formatter, void *arg);
+ struct fsmap *out_recs);
#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3e61d0cc23f8..13851c0d640b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,105 +17,149 @@
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
+#include "xfs_trace.h"
+
+/*
+ * Write new AG headers to disk. Non-transactional, but need to be
+ * written and completed prior to the growfs transaction being logged.
+ * To do this, we use a delayed write buffer list and wait for
+ * submission and IO completion of the list as a whole. This allows the
+ * IO subsystem to merge all the AG headers in a single AG into a single
+ * IO and hide most of the latency of the IO from us.
+ *
+ * This also means that if we get an error whilst building the buffer
+ * list to write, we can cancel the entire list without having written
+ * anything.
+ */
+static int
+xfs_resizefs_init_new_ags(
+ struct xfs_trans *tp,
+ struct aghdr_init_data *id,
+ xfs_agnumber_t oagcount,
+ xfs_agnumber_t nagcount,
+ xfs_rfsblock_t delta,
+ struct xfs_perag *last_pag,
+ bool *lastag_extended)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta;
+ int error;
+
+ *lastag_extended = false;
+
+ INIT_LIST_HEAD(&id->buffer_list);
+ for (id->agno = nagcount - 1;
+ id->agno >= oagcount;
+ id->agno--, delta -= id->agsize) {
+
+ if (id->agno == nagcount - 1)
+ id->agsize = nb - (id->agno *
+ (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+ else
+ id->agsize = mp->m_sb.sb_agblocks;
+
+ error = xfs_ag_init_headers(mp, id);
+ if (error) {
+ xfs_buf_delwri_cancel(&id->buffer_list);
+ return error;
+ }
+ }
+
+ error = xfs_buf_delwri_submit(&id->buffer_list);
+ if (error)
+ return error;
+
+ if (delta) {
+ *lastag_extended = true;
+ error = xfs_ag_extend_space(last_pag, tp, delta);
+ }
+ return error;
+}
/*
* growfs operations
*/
static int
xfs_growfs_data_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_data_t *in) /* growfs data input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_data *in) /* growfs data input struct */
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
- xfs_rfsblock_t nb, nb_mod;
- xfs_rfsblock_t new;
+ xfs_rfsblock_t nb, nb_div, nb_mod;
+ int64_t delta;
+ bool lastag_extended;
xfs_agnumber_t oagcount;
- xfs_trans_t *tp;
+ struct xfs_trans *tp;
struct aghdr_init_data id = {};
+ struct xfs_perag *last_pag;
nb = in->newblocks;
- if (nb < mp->m_sb.sb_dblocks)
- return -EINVAL;
- if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+ error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
+ if (error)
return error;
- error = xfs_buf_read_uncached(mp->m_ddev_targp,
+
+ if (nb > mp->m_sb.sb_dblocks) {
+ error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
- if (error)
- return error;
- xfs_buf_relse(bp);
+ if (error)
+ return error;
+ xfs_buf_relse(bp);
+ }
- new = nb; /* use new as a temporary here */
- nb_mod = do_div(new, mp->m_sb.sb_agblocks);
- nagcount = new + (nb_mod != 0);
+ nb_div = nb;
+ nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
+ nagcount = nb_div + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
- if (nb < mp->m_sb.sb_dblocks)
- return -EINVAL;
}
- new = nb - mp->m_sb.sb_dblocks;
- oagcount = mp->m_sb.sb_agcount;
+ delta = nb - mp->m_sb.sb_dblocks;
+ /*
+ * Reject filesystems with a single AG because they are not
+ * supported, and reject a shrink operation that would cause a
+ * filesystem to become unsupported.
+ */
+ if (delta < 0 && nagcount < 2)
+ return -EINVAL;
+ oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
- error = xfs_initialize_perag(mp, nagcount, &nagimax);
+ error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
if (error)
return error;
+ } else if (nagcount < oagcount) {
+ /* TODO: shrinking the entire AGs hasn't yet completed */
+ return -EINVAL;
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
- XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Write new AG headers to disk. Non-transactional, but need to be
- * written and completed prior to the growfs transaction being logged.
- * To do this, we use a delayed write buffer list and wait for
- * submission and IO completion of the list as a whole. This allows the
- * IO subsystem to merge all the AG headers in a single AG into a single
- * IO and hide most of the latency of the IO from us.
- *
- * This also means that if we get an error whilst building the buffer
- * list to write, we can cancel the entire list without having written
- * anything.
- */
- INIT_LIST_HEAD(&id.buffer_list);
- for (id.agno = nagcount - 1;
- id.agno >= oagcount;
- id.agno--, new -= id.agsize) {
-
- if (id.agno == nagcount - 1)
- id.agsize = nb -
- (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
- else
- id.agsize = mp->m_sb.sb_agblocks;
+ last_pag = xfs_perag_get(mp, oagcount - 1);
+ if (delta > 0) {
+ error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
+ delta, last_pag, &lastag_extended);
+ } else {
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
+ "EXPERIMENTAL online shrink feature in use. Use at your own risk!");
- error = xfs_ag_init_headers(mp, &id);
- if (error) {
- xfs_buf_delwri_cancel(&id.buffer_list);
- goto out_trans_cancel;
- }
+ error = xfs_ag_shrink_space(last_pag, &tp, -delta);
}
- error = xfs_buf_delwri_submit(&id.buffer_list);
+ xfs_perag_put(last_pag);
if (error)
goto out_trans_cancel;
- xfs_trans_agblocks_delta(tp, id.nfree);
-
- /* If there are new blocks in the old last AG, extend it. */
- if (new) {
- error = xfs_ag_extend_space(mp, tp, &id, new);
- if (error)
- goto out_trans_cancel;
- }
-
/*
* Update changed superblock fields transactionally. These are not
* seen by the rest of the world until the transaction commit applies
@@ -123,11 +167,19 @@ xfs_growfs_data_private(
*/
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
- if (nb > mp->m_sb.sb_dblocks)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
- nb - mp->m_sb.sb_dblocks);
+ if (delta)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
if (id.nfree)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
+
+ /*
+ * Sync sb counters now to reflect the updated values. This is
+ * particularly important for shrink because the write verifier
+ * will fail if sb_fdblocks is ever larger than sb_dblocks.
+ */
+ if (xfs_has_lazysbcount(mp))
+ xfs_log_sb(tp);
+
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
if (error)
@@ -139,28 +191,29 @@ xfs_growfs_data_private(
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
- /*
- * If we expanded the last AG, free the per-AG reservation
- * so we can reinitialize it with the new size.
- */
- if (new) {
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, id.agno);
- error = xfs_ag_resv_free(pag);
- xfs_perag_put(pag);
- if (error)
- return error;
+ if (delta > 0) {
+ /*
+ * If we expanded the last AG, free the per-AG reservation
+ * so we can reinitialize it with the new size.
+ */
+ if (lastag_extended) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, id.agno);
+ error = xfs_ag_resv_free(pag);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+ /*
+ * Reserve AG metadata blocks. ENOSPC here does not mean there
+ * was a growfs failure, just that there still isn't space for
+ * new user data after the grow has been run.
+ */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error == -ENOSPC)
+ error = 0;
}
-
- /*
- * Reserve AG metadata blocks. ENOSPC here does not mean there was a
- * growfs failure, just that there still isn't space for new user data
- * after the grow has been run.
- */
- error = xfs_fs_reserve_ag_blocks(mp);
- if (error == -ENOSPC)
- error = 0;
return error;
out_trans_cancel:
@@ -170,8 +223,8 @@ out_trans_cancel:
static int
xfs_growfs_log_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_log_t *in) /* growfs log input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_log *in) /* growfs log input struct */
{
xfs_extlen_t nb;
@@ -268,7 +321,7 @@ out_error:
int
xfs_growfs_log(
xfs_mount_t *mp,
- xfs_growfs_log_t *in)
+ struct xfs_growfs_log *in)
{
int error;
@@ -293,11 +346,8 @@ xfs_fs_counts(
cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
-
- spin_lock(&mp->m_sb_lock);
- cnt->freertx = mp->m_sb.sb_frextents;
- spin_unlock(&mp->m_sb_lock);
+ xfs_fdblocks_unavailable(mp);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*
@@ -376,46 +426,36 @@ xfs_reserve_blocks(
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space.
+ *
+ * The code below estimates how many blocks it can request from
+ * fdblocks to stash in the reserve pool. This is a classic TOCTOU
+ * race since fdblocks updates are not always coordinated via
+ * m_sb_lock. Set the reserve size even if there's not enough free
+ * space to fill it because mod_fdblocks will refill an undersized
+ * reserve when it can.
*/
- error = -ENOSPC;
- do {
- free = percpu_counter_sum(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
- if (free <= 0)
- break;
-
- delta = request - mp->m_resblks;
- lcounter = free - delta;
- if (lcounter < 0)
- /* We can't satisfy the request, just get what we can */
- fdblks_delta = free;
- else
- fdblks_delta = delta;
-
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ delta = request - mp->m_resblks;
+ mp->m_resblks = request;
+ if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
- * count or we'll get an ENOSPC. If we get a ENOSPC, it means
- * things changed while we were calculating fdblks_delta and so
- * we should try again to see if there is anything left to
- * reserve.
+ * count or we'll get an ENOSPC. Don't set the reserved flag
+ * here - we don't want to reserve the extra reserve blocks
+ * from the reserve.
*
- * Don't set the reserved flag here - we don't want to reserve
- * the extra reserve blocks from the reserve.....
+ * The desired reserve size can change after we drop the lock.
+ * Use mod_fdblocks to put the space into the reserve or into
+ * fdblocks as appropriate.
*/
+ fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ if (!error)
+ xfs_mod_fdblocks(mp, fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
- } while (error == -ENOSPC);
-
- /*
- * Update the reserve counters if blocks have been successfully
- * allocated.
- */
- if (!error && fdblks_delta) {
- mp->m_resblks += fdblks_delta;
- mp->m_resblks_avail += fdblks_delta;
}
-
out:
if (outval) {
outval->resblks = mp->m_resblks;
@@ -433,13 +473,10 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
-
- if (sb && !IS_ERR(sb)) {
+ if (!freeze_bdev(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
- thaw_bdev(sb->s_bdev, sb);
+ thaw_bdev(mp->m_super->s_bdev);
}
-
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
@@ -461,59 +498,56 @@ xfs_fs_goingdown(
* consistent. We don't do an unmount here; just shutdown the shop, make sure
* that absolutely nothing persistent happens to this filesystem after this
* point.
+ *
+ * The shutdown state change is atomic, resulting in the first and only the
+ * first shutdown call processing the shutdown. This means we only shutdown the
+ * log once as it requires, and we don't spam the logs when multiple concurrent
+ * shutdowns race to set the shutdown flags.
*/
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
- int flags,
+ uint32_t flags,
char *fname,
int lnnum)
{
- bool logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+ int tag;
+ const char *why;
- /*
- * No need to duplicate efforts.
- */
- if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
- return;
- /*
- * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
- * queue up anybody new on the log reservations, and wakes up
- * everybody who's sleeping on log reservations to tell them
- * the bad news.
- */
- if (xfs_log_force_umount(mp, logerror))
- return;
-
- if (flags & SHUTDOWN_FORCE_UMOUNT) {
- xfs_alert(mp,
-"User initiated shutdown received. Shutting down filesystem");
+ if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+ xlog_shutdown_wait(mp->m_log);
return;
}
-
- xfs_notice(mp,
-"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
- __func__, flags, lnnum, fname, __return_address);
-
- if (flags & SHUTDOWN_CORRUPT_INCORE) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
-"Corruption of in-memory data detected. Shutting down filesystem");
- if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
- xfs_stack_trace();
- } else if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
+ if (mp->m_sb_bp)
+ mp->m_sb_bp->b_flags |= XBF_DONE;
+
+ if (flags & SHUTDOWN_FORCE_UMOUNT)
+ xfs_alert(mp, "User initiated shutdown received.");
+
+ if (xlog_force_shutdown(mp->m_log, flags)) {
+ tag = XFS_PTAG_SHUTDOWN_LOGERROR;
+ why = "Log I/O Error";
+ } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of in-memory data";
+ } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of on-disk metadata";
+ } else {
+ tag = XFS_PTAG_SHUTDOWN_IOERROR;
+ why = "Metadata I/O Error";
}
+ trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
+
+ xfs_alert_tag(mp, tag,
+"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.",
+ why, flags, __return_address, fname, lnnum);
xfs_alert(mp,
"Please unmount the filesystem and rectify the problem(s)");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
}
/*
@@ -529,10 +563,8 @@ xfs_fs_reserve_ag_blocks(
int err2;
mp->m_finobt_nores = false;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_init(pag, NULL);
- xfs_perag_put(pag);
if (err2 && !error)
error = err2;
}
@@ -558,10 +590,8 @@ xfs_fs_unreserve_ag_blocks(
int error = 0;
int err2;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
err2 = xfs_ag_resv_free(pag);
- xfs_perag_put(pag);
if (err2 && !error)
error = err2;
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 92869f6ec8d3..2cffe51a31e8 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,8 +6,8 @@
#ifndef __XFS_FSOPS_H__
#define __XFS_FSOPS_H__
-extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
-extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
+extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
xfs_fsop_resblks_t *outval);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index fa55ab8b8d80..4d0a98f920ca 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -8,8 +8,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second) with the exception of eofb_timer and cowb_timer, which
- * are measured in seconds.
+ * 100ths of a second) with the exception of blockgc_timer, which is measured
+ * in seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
@@ -28,8 +28,7 @@ xfs_param_t xfs_params = {
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
- .eofb_timer = { 1, 300, 3600*24},
- .cowb_timer = { 1, 1800, 3600*24},
+ .blockgc_timer = { 1, 300, 3600*24},
};
struct xfs_globals xfs_globals = {
@@ -42,5 +41,6 @@ struct xfs_globals xfs_globals = {
#endif
#ifdef DEBUG
.pwork_threads = -1, /* automatic thread detection */
+ .larp = false, /* log attribute replay */
#endif
};
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 8e0cb05a7142..72a075bb2c10 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -9,11 +9,11 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
#include "xfs_health.h"
+#include "xfs_ag.h"
/*
* Warn about metadata corruption that we detected but haven't fixed, and
@@ -30,18 +30,16 @@ xfs_health_unmount(
unsigned int checked = 0;
bool warn = false;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return;
/* Measure AG corruption levels. */
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xfs_perag_get(mp, agno);
+ for_each_perag(mp, agno, pag) {
xfs_ag_measure_sickness(pag, &sick, &checked);
if (sick) {
trace_xfs_ag_unfixed_corruption(mp, agno, sick);
warn = true;
}
- xfs_perag_put(pag);
}
/* Measure realtime volume corruption levels. */
@@ -231,6 +229,15 @@ xfs_inode_mark_sick(
ip->i_sick |= mask;
ip->i_checked |= mask;
spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Keep this inode around so we don't lose the sickness report. Scrub
+ * grabs inodes with DONTCACHE assuming that most inode are ok, which
+ * is not the case here.
+ */
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
}
/* Mark parts of an inode healed. */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 8dc2e5414276..eae7427062cf 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
@@ -22,9 +21,49 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_log_priv.h"
#include <linux/iversion.h>
+/* Radix tree tags for incore inode tree. */
+
+/* inode is to be reclaimed */
+#define XFS_ICI_RECLAIM_TAG 0
+/* Inode has speculative preallocations (posteof or cow) to clean. */
+#define XFS_ICI_BLOCKGC_TAG 1
+
+/*
+ * The goal for walking incore inodes. These can correspond with incore inode
+ * radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
+ */
+enum xfs_icwalk_goal {
+ /* Goals directly associated with tagged inodes. */
+ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
+ XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
+};
+
+static int xfs_icwalk(struct xfs_mount *mp,
+ enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
+static int xfs_icwalk_ag(struct xfs_perag *pag,
+ enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
+
+/*
+ * Private inode cache walk flags for struct xfs_icwalk. Must not
+ * coincide with XFS_ICWALK_FLAGS_VALID.
+ */
+
+/* Stop scanning after icw_scan_limit inodes. */
+#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
+
+#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27)
+#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
+
+#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \
+ XFS_ICWALK_FLAG_RECLAIM_SICK | \
+ XFS_ICWALK_FLAG_UNION)
+
/*
* Allocate and initialise an xfs_inode.
*/
@@ -36,43 +75,45 @@ xfs_inode_alloc(
struct xfs_inode *ip;
/*
- * if this didn't occur in transactions, we could use
- * KM_MAYFAIL and return NULL here on ENOMEM. Set the
- * code up to do this anyway.
+ * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
+ * and return NULL here on ENOMEM.
*/
- ip = kmem_zone_alloc(xfs_inode_zone, 0);
- if (!ip)
- return NULL;
+ ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
+
if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
return NULL;
}
- /* VFS doesn't initialise i_mode! */
+ /* VFS doesn't initialise i_mode or i_state! */
VFS_I(ip)->i_mode = 0;
+ VFS_I(ip)->i_state = 0;
+ mapping_set_large_folios(VFS_I(ip)->i_mapping);
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
/* initialise the xfs inode */
ip->i_ino = ino;
ip->i_mount = mp;
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
- ip->i_afp = NULL;
ip->i_cowfp = NULL;
- ip->i_cnextents = 0;
- ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
+ memset(&ip->i_af, 0, sizeof(ip->i_af));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
memset(&ip->i_df, 0, sizeof(ip->i_df));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
- memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
+ ip->i_nblocks = 0;
+ ip->i_forkoff = 0;
ip->i_sick = 0;
ip->i_checked = 0;
INIT_WORK(&ip->i_ioend_work, xfs_end_io);
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
return ip;
}
@@ -88,15 +129,16 @@ xfs_inode_free_callback(
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(&ip->i_df);
break;
}
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- if (ip->i_cowfp)
- xfs_idestroy_fork(ip, XFS_COW_FORK);
+ xfs_ifork_zap_attr(ip);
+ if (ip->i_cowfp) {
+ xfs_idestroy_fork(ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
+ }
if (ip->i_itemp) {
ASSERT(!test_bit(XFS_LI_IN_AIL,
&ip->i_itemp->ili_item.li_flags));
@@ -104,7 +146,7 @@ xfs_inode_free_callback(
ip->i_itemp = NULL;
}
- kmem_cache_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_cache, ip);
}
static void
@@ -113,6 +155,7 @@ __xfs_inode_free(
{
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -122,7 +165,7 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
- ASSERT(!xfs_isiflocked(ip));
+ ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -139,11 +182,8 @@ xfs_inode_free(
}
/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
+ * Queue background inode reclaim work if there are reclaimable inodes and there
+ * isn't reclaim work already scheduled or in progress.
*/
static void
xfs_reclaim_work_queue(
@@ -159,115 +199,97 @@ xfs_reclaim_work_queue(
}
/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
+ * Background scanning to trim preallocated space. This is queued based on the
+ * 'speculative_prealloc_lifetime' tunable (5m by default).
*/
-void
-xfs_reclaim_worker(
- struct work_struct *work)
+static inline void
+xfs_blockgc_queue(
+ struct xfs_perag *pag)
{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_reclaim_work);
+ struct xfs_mount *mp = pag->pag_mount;
- xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_reclaim_work_queue(mp);
+ if (!xfs_is_blockgc_enabled(mp))
+ return;
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
+ queue_delayed_work(pag->pag_mount->m_blockgc_wq,
+ &pag->pag_blockgc_work,
+ msecs_to_jiffies(xfs_blockgc_secs * 1000));
+ rcu_read_unlock();
}
+/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
-xfs_perag_set_reclaim_tag(
- struct xfs_perag *pag)
+xfs_perag_set_inode_tag(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ unsigned int tag)
{
struct xfs_mount *mp = pag->pag_mount;
+ bool was_tagged;
lockdep_assert_held(&pag->pag_ici_lock);
- if (pag->pag_ici_reclaimable++)
+
+ was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
+ radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
+
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ pag->pag_ici_reclaimable++;
+
+ if (was_tagged)
return;
- /* propagate the reclaim tag up into the perag radix tree */
+ /* propagate the tag up into the perag radix tree */
spin_lock(&mp->m_perag_lock);
- radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
- XFS_ICI_RECLAIM_TAG);
+ radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
spin_unlock(&mp->m_perag_lock);
- /* schedule periodic background inode reclaim */
- xfs_reclaim_work_queue(mp);
+ /* start background work */
+ switch (tag) {
+ case XFS_ICI_RECLAIM_TAG:
+ xfs_reclaim_work_queue(mp);
+ break;
+ case XFS_ICI_BLOCKGC_TAG:
+ xfs_blockgc_queue(pag);
+ break;
+ }
- trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+ trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}
+/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
-xfs_perag_clear_reclaim_tag(
- struct xfs_perag *pag)
+xfs_perag_clear_inode_tag(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ unsigned int tag)
{
struct xfs_mount *mp = pag->pag_mount;
lockdep_assert_held(&pag->pag_ici_lock);
- if (--pag->pag_ici_reclaimable)
- return;
-
- /* clear the reclaim tag from the perag radix tree */
- spin_lock(&mp->m_perag_lock);
- radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
- XFS_ICI_RECLAIM_TAG);
- spin_unlock(&mp->m_perag_lock);
- trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
-
- radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
- XFS_ICI_RECLAIM_TAG);
- xfs_perag_set_reclaim_tag(pag);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ /*
+ * Reclaim can signal (with a null agino) that it cleared its own tag
+ * by removing the inode from the radix tree.
+ */
+ if (agino != NULLAGINO)
+ radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
+ else
+ ASSERT(tag == XFS_ICI_RECLAIM_TAG);
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
-}
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ pag->pag_ici_reclaimable--;
-STATIC void
-xfs_inode_clear_reclaim_tag(
- struct xfs_perag *pag,
- xfs_ino_t ino)
-{
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(pag->pag_mount, ino),
- XFS_ICI_RECLAIM_TAG);
- xfs_perag_clear_reclaim_tag(pag);
-}
+ if (radix_tree_tagged(&pag->pag_ici_root, tag))
+ return;
-static void
-xfs_inew_wait(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
+ /* clear the tag from the perag radix tree */
+ spin_lock(&mp->m_perag_lock);
+ radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
+ spin_unlock(&mp->m_perag_lock);
- do {
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (!xfs_iflags_test(ip, XFS_INEW))
- break;
- schedule();
- } while (true);
- finish_wait(wq, &wait.wq_entry);
+ trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}
/*
@@ -283,12 +305,14 @@ xfs_reinit_inode(
struct xfs_mount *mp,
struct inode *inode)
{
- int error;
- uint32_t nlink = inode->i_nlink;
- uint32_t generation = inode->i_generation;
- uint64_t version = inode_peek_iversion(inode);
- umode_t mode = inode->i_mode;
- dev_t dev = inode->i_rdev;
+ int error;
+ uint32_t nlink = inode->i_nlink;
+ uint32_t generation = inode->i_generation;
+ uint64_t version = inode_peek_iversion(inode);
+ umode_t mode = inode->i_mode;
+ dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -297,10 +321,76 @@ xfs_reinit_inode(
inode_set_iversion_queried(inode, version);
inode->i_mode = mode;
inode->i_rdev = dev;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
+ mapping_set_large_folios(inode->i_mapping);
return error;
}
/*
+ * Carefully nudge an inode whose VFS state has been torn down back into a
+ * usable state. Drops the i_flags_lock and the rcu read lock.
+ */
+static int
+xfs_iget_recycle(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip) __releases(&ip->i_flags_lock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ trace_xfs_iget_recycle(ip);
+
+ /*
+ * We need to make it look like the inode is being reclaimed to prevent
+ * the actual reclaim workers from stomping over us while we recycle
+ * the inode. We can't clear the radix tree tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ error = xfs_reinit_inode(mp, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ trace_xfs_iget_recycle_fail(ip);
+ return error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now effectively
+ * a new inode and need to return to the initial state before reuse
+ * occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ inode->i_state = I_NEW;
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+
+ return 0;
+}
+
+/*
* If we are allocating a new inode, then check what was returned is
* actually a free, empty inode. If we are not allocating an inode,
* then check we didn't find a free inode.
@@ -324,7 +414,7 @@ xfs_iget_check_free_state(
return -EFSCORRUPTED;
}
- if (ip->i_d.di_nblocks != 0) {
+ if (ip->i_nblocks != 0) {
xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
ip->i_ino);
@@ -340,6 +430,21 @@ xfs_iget_check_free_state(
return 0;
}
+/* Make all pending inactivation work start immediately. */
+static void
+xfs_inodegc_queue_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list))
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+ }
+}
+
/*
* Check the validity of the inode we just found it the cache
*/
@@ -363,29 +468,37 @@ xfs_iget_cache_hit(
* will not match, so check for that, too.
*/
spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != ino) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(mp, xs_ig_frecycle);
- error = -EAGAIN;
- goto out_error;
- }
-
+ if (ip->i_ino != ino)
+ goto out_skip;
/*
* If we are racing with another cache hit that is currently
* instantiating this inode or currently recycling it out of
- * reclaimabe state, wait for the initialisation to complete
+ * reclaimable state, wait for the initialisation to complete
* before continuing.
*
+ * If we're racing with the inactivation worker we also want to wait.
+ * If we're creating a new file, it's possible that the worker
+ * previously marked the inode as free on disk but hasn't finished
+ * updating the incore state yet. The AGI buffer will be dirty and
+ * locked to the icreate transaction, so a synchronous push of the
+ * inodegc workers would result in deadlock. For a regular iget, the
+ * worker is running already, so we might as well wait.
+ *
* XXX(hch): eventually we should do something equivalent to
* wait_on_inode to wait for these flags to be cleared
* instead of polling for it.
*/
- if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(mp, xs_ig_frecycle);
- error = -EAGAIN;
- goto out_error;
+ if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
+ goto out_skip;
+
+ if (ip->i_flags & XFS_NEED_INACTIVE) {
+ /* Unlinked inodes cannot be re-grabbed. */
+ if (VFS_I(ip)->i_nlink == 0) {
+ error = -ENOENT;
+ goto out_error;
+ }
+ goto out_inodegc_flush;
}
/*
@@ -396,74 +509,21 @@ xfs_iget_cache_hit(
if (error)
goto out_error;
- /*
- * If IRECLAIMABLE is set, we've torn down the VFS inode already.
- * Need to carefully get it back into useable state.
- */
- if (ip->i_flags & XFS_IRECLAIMABLE) {
- trace_xfs_iget_reclaim(ip);
-
- if (flags & XFS_IGET_INCORE) {
- error = -EAGAIN;
- goto out_error;
- }
-
- /*
- * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
- * from stomping over us while we recycle the inode. We can't
- * clear the radix tree reclaimable tag yet as it requires
- * pag_ici_lock to be held exclusive.
- */
- ip->i_flags |= XFS_IRECLAIM;
-
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
-
- error = xfs_reinit_inode(mp, inode);
- if (error) {
- bool wake;
- /*
- * Re-initializing the inode failed, and we are in deep
- * trouble. Try to re-add it to the reclaim list.
- */
- rcu_read_lock();
- spin_lock(&ip->i_flags_lock);
- wake = !!__xfs_iflags_test(ip, XFS_INEW);
- ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
- if (wake)
- wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
- ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
- trace_xfs_iget_reclaim_fail(ip);
- goto out_error;
- }
-
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
-
- /*
- * Clear the per-lifetime state in the inode as we are now
- * effectively a new inode and need to return to the initial
- * state before reuse occurs.
- */
- ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
- ip->i_flags |= XFS_INEW;
- xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
- inode->i_state = I_NEW;
- ip->i_sick = 0;
- ip->i_checked = 0;
-
- ASSERT(!rwsem_is_locked(&inode->i_rwsem));
- init_rwsem(&inode->i_rwsem);
+ /* Skip inodes that have no vfs state. */
+ if ((flags & XFS_IGET_INCORE) &&
+ (ip->i_flags & XFS_IRECLAIMABLE))
+ goto out_skip;
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
+ /* The inode fits the selection criteria; process it. */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ /* Drops i_flags_lock and RCU read lock. */
+ error = xfs_iget_recycle(pag, ip);
+ if (error)
+ return error;
} else {
/* If the VFS inode is being torn down, pause and try again. */
- if (!igrab(inode)) {
- trace_xfs_iget_skip(ip);
- error = -EAGAIN;
- goto out_error;
- }
+ if (!igrab(inode))
+ goto out_skip;
/* We've got a live one. */
spin_unlock(&ip->i_flags_lock);
@@ -475,17 +535,31 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
if (!(flags & XFS_IGET_INCORE))
- xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ xfs_iflags_clear(ip, XFS_ISTALE);
XFS_STATS_INC(mp, xs_ig_found);
return 0;
+out_skip:
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
+ error = -EAGAIN;
out_error:
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
return error;
-}
+out_inodegc_flush:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ /*
+ * Do not wait for the workers, because the caller could hold an AGI
+ * buffer lock. We're just going to sleep in a loop anyway.
+ */
+ if (xfs_is_inodegc_enabled(mp))
+ xfs_inodegc_queue_all(mp);
+ return -EAGAIN;
+}
static int
xfs_iget_cache_miss(
@@ -506,18 +580,42 @@ xfs_iget_cache_miss(
if (!ip)
return -ENOMEM;
- error = xfs_iread(mp, tp, ip, flags);
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
if (error)
goto out_destroy;
- if (!xfs_inode_verify_forks(ip)) {
- error = -EFSCORRUPTED;
- goto out_destroy;
+ /*
+ * For version 5 superblocks, if we are initialising a new inode and we
+ * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
+ * simply build the new inode core with a random generation number.
+ *
+ * For version 4 (and older) superblocks, log recovery is dependent on
+ * the i_flushiter field being initialised from the current on-disk
+ * value and hence we must also read the inode off disk even when
+ * initializing new inodes.
+ */
+ if (xfs_has_v3inodes(mp) &&
+ (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
+ VFS_I(ip)->i_generation = get_random_u32();
+ } else {
+ struct xfs_buf *bp;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
+ if (error)
+ goto out_destroy;
+
+ error = xfs_inode_from_disk(ip,
+ xfs_buf_offset(bp, ip->i_imap.im_boffset));
+ if (!error)
+ xfs_buf_set_ref(bp, XFS_INO_REF);
+ xfs_trans_brelse(tp, bp);
+
+ if (error)
+ goto out_destroy;
}
trace_xfs_iget_miss(ip);
-
/*
* Check the inode free state is valid. This also detects lookup
* racing with unlinks.
@@ -557,7 +655,7 @@ xfs_iget_cache_miss(
*/
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
- iflags |= XFS_IDONTCACHE;
+ d_mark_dontcache(VFS_I(ip));
ip->i_udquot = NULL;
ip->i_gdquot = NULL;
ip->i_pdquot = NULL;
@@ -590,48 +688,31 @@ out_destroy:
}
/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
+ * Look up an inode by number in the given file system. The inode is looked up
+ * in the cache held in each AG. If the inode is found in the cache, initialise
+ * the vfs inode if necessary.
*
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
+ * If it is not in core, read it in from the file system's device, add it to the
+ * cache and initialise the vfs inode.
*
* The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
+ * Inode lookup is only done during metadata operations and not as part of the
+ * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
*/
int
xfs_iget(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp)
-{
- xfs_inode_t *ip;
- int error;
- xfs_perag_t *pag;
- xfs_agino_t agino;
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ struct xfs_inode **ipp)
+{
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t agino;
+ int error;
- /*
- * xfs_reclaim_inode() uses the ILOCK to ensure an inode
- * doesn't get freed while it's being referenced during a
- * radix tree traversal here. It assumes this function
- * aqcuires only the ILOCK (and therefore it has no need to
- * involve the IOLOCK in this synchronization).
- */
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
/* reject inode numbers outside existing AGs */
@@ -672,7 +753,8 @@ again:
/*
* If we have a real type for an on-disk inode, we can setup the inode
- * now. If it's a new inode being created, xfs_ialloc will handle it.
+ * now. If it's a new inode being created, xfs_init_new_inode will
+ * handle it.
*/
if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
xfs_setup_existing_inode(ip);
@@ -726,445 +808,93 @@ xfs_icache_inode_is_allocated(
}
/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH 32
-
-STATIC int
-xfs_inode_ag_walk_grab(
- struct xfs_inode *ip,
- int flags)
-{
- struct inode *inode = VFS_I(ip);
- bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
-
- ASSERT(rcu_read_lock_held());
-
- /*
- * check for stale RCU freed inode
- *
- * If the inode has been reallocated, it doesn't matter if it's not in
- * the AG we are walking - we are walking for writeback, so if it
- * passes all the "valid inode" checks and is dirty, then we'll write
- * it back anyway. If it has been reallocated and still being
- * initialised, the XFS_INEW check below will catch it.
- */
- spin_lock(&ip->i_flags_lock);
- if (!ip->i_ino)
- goto out_unlock_noent;
-
- /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
- __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
- goto out_unlock_noent;
- spin_unlock(&ip->i_flags_lock);
-
- /* nothing to sync during shutdown */
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EFSCORRUPTED;
-
- /* If we can't grab the inode, it must on it's way to reclaim. */
- if (!igrab(inode))
- return -ENOENT;
-
- /* inode is valid */
- return 0;
-
-out_unlock_noent:
- spin_unlock(&ip->i_flags_lock);
- return -ENOENT;
-}
-
-STATIC int
-xfs_inode_ag_walk(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int tag,
- int iter_flags)
-{
- uint32_t first_index;
- int last_error = 0;
- int skipped;
- int done;
- int nr_found;
-
-restart:
- done = 0;
- skipped = 0;
- first_index = 0;
- nr_found = 0;
- do {
- struct xfs_inode *batch[XFS_LOOKUP_BATCH];
- int error = 0;
- int i;
-
- rcu_read_lock();
-
- if (tag == -1)
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
- (void **)batch, first_index,
- XFS_LOOKUP_BATCH);
- else
- nr_found = radix_tree_gang_lookup_tag(
- &pag->pag_ici_root,
- (void **) batch, first_index,
- XFS_LOOKUP_BATCH, tag);
-
- if (!nr_found) {
- rcu_read_unlock();
- break;
- }
-
- /*
- * Grab the inodes before we drop the lock. if we found
- * nothing, nr == 0 and the loop will be skipped.
- */
- for (i = 0; i < nr_found; i++) {
- struct xfs_inode *ip = batch[i];
-
- if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
- batch[i] = NULL;
-
- /*
- * Update the index for the next lookup. Catch
- * overflows into the next AG range which can occur if
- * we have inodes in the last block of the AG and we
- * are currently pointing to the last inode.
- *
- * Because we may see inodes that are from the wrong AG
- * due to RCU freeing and reallocation, only update the
- * index if it lies in this AG. It was a race that lead
- * us to see this inode, so another lookup from the
- * same index will not find it again.
- */
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
- continue;
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
- }
-
- /* unlock now we've grabbed the inodes. */
- rcu_read_unlock();
-
- for (i = 0; i < nr_found; i++) {
- if (!batch[i])
- continue;
- if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
- xfs_iflags_test(batch[i], XFS_INEW))
- xfs_inew_wait(batch[i]);
- error = execute(batch[i], flags, args);
- xfs_irele(batch[i]);
- if (error == -EAGAIN) {
- skipped++;
- continue;
- }
- if (error && last_error != -EFSCORRUPTED)
- last_error = error;
- }
-
- /* bail out if the filesystem is corrupted. */
- if (error == -EFSCORRUPTED)
- break;
-
- cond_resched();
-
- } while (nr_found && !done);
-
- if (skipped) {
- delay(1);
- goto restart;
- }
- return last_error;
-}
-
-/*
- * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
- */
-void
-xfs_queue_eofblocks(
- struct xfs_mount *mp)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
- queue_delayed_work(mp->m_eofblocks_workqueue,
- &mp->m_eofblocks_work,
- msecs_to_jiffies(xfs_eofb_secs * 1000));
- rcu_read_unlock();
-}
-
-void
-xfs_eofblocks_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_eofblocks_work);
- xfs_icache_free_eofblocks(mp, NULL);
- xfs_queue_eofblocks(mp);
-}
-
-/*
- * Background scanning to trim preallocated CoW space. This is queued
- * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
- * (We'll just piggyback on the post-EOF prealloc space workqueue.)
- */
-void
-xfs_queue_cowblocks(
- struct xfs_mount *mp)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
- queue_delayed_work(mp->m_eofblocks_workqueue,
- &mp->m_cowblocks_work,
- msecs_to_jiffies(xfs_cowb_secs * 1000));
- rcu_read_unlock();
-}
-
-void
-xfs_cowblocks_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_cowblocks_work);
- xfs_icache_free_cowblocks(mp, NULL);
- xfs_queue_cowblocks(mp);
-}
-
-int
-xfs_inode_ag_iterator_flags(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int iter_flags)
-{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
-
- ag = 0;
- while ((pag = xfs_perag_get(mp, ag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
- iter_flags);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args)
-{
- return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
-}
-
-int
-xfs_inode_ag_iterator_tag(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int tag)
-{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
-
- ag = 0;
- while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
- 0);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
-}
-
-/*
* Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
+ *
+ * We have found this inode via a lookup under RCU, so the inode may have
+ * already been freed, or it may be in the process of being recycled by
+ * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
+ * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
+ * will not be set. Hence we need to check for both these flag conditions to
+ * avoid inodes that are no longer reclaim candidates.
+ *
+ * Note: checking for other state flags here, under the i_flags_lock or not, is
+ * racy and should be avoided. Those races should be resolved only after we have
+ * ensured that we are able to reclaim this inode and the world can see that we
+ * are going to reclaim it.
+ *
+ * Return true if we grabbed it, false otherwise.
*/
-STATIC int
-xfs_reclaim_inode_grab(
+static bool
+xfs_reclaim_igrab(
struct xfs_inode *ip,
- int flags)
+ struct xfs_icwalk *icw)
{
ASSERT(rcu_read_lock_held());
- /* quick check for stale RCU freed inode */
- if (!ip->i_ino)
- return 1;
-
- /*
- * If we are asked for non-blocking operation, do unlocked checks to
- * see if the inode already is being flushed or in reclaim to avoid
- * lock traffic.
- */
- if ((flags & SYNC_TRYLOCK) &&
- __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
- return 1;
-
- /*
- * The radix tree lock here protects a thread in xfs_iget from racing
- * with us starting reclaim on the inode. Once we have the
- * XFS_IRECLAIM flag set it will not touch us.
- *
- * Due to RCU lookup, we may find inodes that have been freed and only
- * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
- * aren't candidates for reclaim at all, so we must check the
- * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
- */
spin_lock(&ip->i_flags_lock);
if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock);
- return 1;
+ return false;
}
+
+ /* Don't reclaim a sick inode unless the caller asked for it. */
+ if (ip->i_sick &&
+ (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+ }
+
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
- return 0;
+ return true;
}
/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- * inode state iflush ret required action
- * --------------- ---------- ---------------
- * bad - reclaim
- * shutdown EIO unpin and reclaim
- * clean, unpinned 0 reclaim
- * stale, unpinned 0 reclaim
- * clean, pinned(*) 0 requeue
- * stale, pinned EAGAIN requeue
- * dirty, async - requeue
- * dirty, sync 0 reclaim
+ * Inode reclaim is non-blocking, so the default action if progress cannot be
+ * made is to "requeue" the inode for reclaim by unlocking it and clearing the
+ * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
+ * blocking anymore and hence we can wait for the inode to be able to reclaim
+ * it.
*
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies. Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting. For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- * bad => reclaim
- * shutdown => unpin and reclaim
- * pinned, async => requeue
- * pinned, sync => unpin
- * stale => reclaim
- * clean => reclaim
- * dirty, async => requeue
- * dirty, sync => flush, wait and reclaim
+ * We do no IO here - if callers require inodes to be cleaned they must push the
+ * AIL first to trigger writeback of dirty inodes. This enables writeback to be
+ * done in the background in a non-blocking manner, and enables memory reclaim
+ * to make progress without blocking.
*/
-STATIC int
+static void
xfs_reclaim_inode(
struct xfs_inode *ip,
- struct xfs_perag *pag,
- int sync_mode)
+ struct xfs_perag *pag)
{
- struct xfs_buf *bp = NULL;
xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
- int error;
-
-restart:
- error = 0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (!xfs_iflock_nowait(ip)) {
- if (!(sync_mode & SYNC_WAIT))
- goto out;
- xfs_iflock(ip);
- }
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- xfs_iunpin_wait(ip);
- /* xfs_iflush_abort() drops the flush lock */
- xfs_iflush_abort(ip, false);
- goto reclaim;
- }
- if (xfs_ipincount(ip)) {
- if (!(sync_mode & SYNC_WAIT))
- goto out_ifunlock;
- xfs_iunpin_wait(ip);
- }
- if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- goto reclaim;
- }
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto out;
+ if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
+ goto out_iunlock;
/*
- * Never flush out dirty data during non-blocking reclaim, as it would
- * just contend with AIL pushing trying to do the same job.
+ * Check for log shutdown because aborting the inode can move the log
+ * tail and corrupt in memory state. This is fine if the log is shut
+ * down, but if the log is still active and only the mount is shut down
+ * then the in-memory log tail movement caused by the abort can be
+ * incorrectly propagated to disk.
*/
- if (!(sync_mode & SYNC_WAIT))
- goto out_ifunlock;
-
- /*
- * Now we have an inode that needs flushing.
- *
- * Note that xfs_iflush will never block on the inode buffer lock, as
- * xfs_ifree_cluster() can lock the inode buffer before it locks the
- * ip->i_lock, and we are doing the exact opposite here. As a result,
- * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
- * result in an ABBA deadlock with xfs_ifree_cluster().
- *
- * As xfs_ifree_cluser() must gather all inodes that are active in the
- * cache to mark them stale, if we hit this case we don't actually want
- * to do IO here - we want the inode marked stale so we can simply
- * reclaim it. Hence if we get an EAGAIN error here, just unlock the
- * inode, back off and try again. Hopefully the next pass through will
- * see the stale flag set on the inode.
- */
- error = xfs_iflush(ip, &bp);
- if (error == -EAGAIN) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /* backoff longer than in xfs_ifree_cluster */
- delay(2);
- goto restart;
- }
-
- if (!error) {
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
+ if (xlog_is_shutdown(ip->i_mount->m_log)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_shutdown_abort(ip);
+ goto reclaim;
}
+ if (xfs_ipincount(ip))
+ goto out_clear_flush;
+ if (!xfs_inode_clean(ip))
+ goto out_clear_flush;
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
- ASSERT(!xfs_isiflocked(ip));
+ trace_xfs_inode_reclaiming(ip);
/*
* Because we use RCU freeing we need to ensure the inode always appears
@@ -1179,8 +909,11 @@ reclaim:
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
+ ip->i_sick = 0;
+ ip->i_checked = 0;
spin_unlock(&ip->i_flags_lock);
+ ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1195,7 +928,7 @@ reclaim:
if (!radix_tree_delete(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ino)))
ASSERT(0);
- xfs_perag_clear_reclaim_tag(pag);
+ xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
spin_unlock(&pag->pag_ici_lock);
/*
@@ -1207,195 +940,86 @@ reclaim:
* unlocked after the lookup before we go ahead and free it.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_qm_dqdetach(ip);
+ ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ ASSERT(xfs_inode_clean(ip));
__xfs_inode_free(ip);
- return error;
+ return;
-out_ifunlock:
- xfs_ifunlock(ip);
+out_clear_flush:
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+out_iunlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
xfs_iflags_clear(ip, XFS_IRECLAIM);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * We could return -EAGAIN here to make reclaim rescan the inode tree in
- * a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and the reclaim work never goes back to
- * the idle state. Instead, return 0 to let the next scheduled
- * background reclaim attempt to reclaim the inode again.
- */
- return 0;
}
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- */
-STATIC int
-xfs_reclaim_inodes_ag(
- struct xfs_mount *mp,
- int flags,
- int *nr_to_scan)
+/* Reclaim sick inodes if we're unmounting or the fs went down. */
+static inline bool
+xfs_want_reclaim_sick(
+ struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
- int trylock = flags & SYNC_TRYLOCK;
- int skipped;
-
-restart:
- ag = 0;
- skipped = 0;
- while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
- unsigned long first_index = 0;
- int done = 0;
- int nr_found = 0;
-
- ag = pag->pag_agno + 1;
-
- if (trylock) {
- if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
- skipped++;
- xfs_perag_put(pag);
- continue;
- }
- first_index = pag->pag_ici_reclaim_cursor;
- } else
- mutex_lock(&pag->pag_ici_reclaim_lock);
-
- do {
- struct xfs_inode *batch[XFS_LOOKUP_BATCH];
- int i;
-
- rcu_read_lock();
- nr_found = radix_tree_gang_lookup_tag(
- &pag->pag_ici_root,
- (void **)batch, first_index,
- XFS_LOOKUP_BATCH,
- XFS_ICI_RECLAIM_TAG);
- if (!nr_found) {
- done = 1;
- rcu_read_unlock();
- break;
- }
-
- /*
- * Grab the inodes before we drop the lock. if we found
- * nothing, nr == 0 and the loop will be skipped.
- */
- for (i = 0; i < nr_found; i++) {
- struct xfs_inode *ip = batch[i];
-
- if (done || xfs_reclaim_inode_grab(ip, flags))
- batch[i] = NULL;
-
- /*
- * Update the index for the next lookup. Catch
- * overflows into the next AG range which can
- * occur if we have inodes in the last block of
- * the AG and we are currently pointing to the
- * last inode.
- *
- * Because we may see inodes that are from the
- * wrong AG due to RCU freeing and
- * reallocation, only update the index if it
- * lies in this AG. It was a race that lead us
- * to see this inode, so another lookup from
- * the same index will not find it again.
- */
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
- pag->pag_agno)
- continue;
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
- if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
- }
-
- /* unlock now we've grabbed the inodes. */
- rcu_read_unlock();
-
- for (i = 0; i < nr_found; i++) {
- if (!batch[i])
- continue;
- error = xfs_reclaim_inode(batch[i], pag, flags);
- if (error && last_error != -EFSCORRUPTED)
- last_error = error;
- }
-
- *nr_to_scan -= XFS_LOOKUP_BATCH;
-
- cond_resched();
-
- } while (nr_found && !done && *nr_to_scan > 0);
-
- if (trylock && !done)
- pag->pag_ici_reclaim_cursor = first_index;
- else
- pag->pag_ici_reclaim_cursor = 0;
- mutex_unlock(&pag->pag_ici_reclaim_lock);
- xfs_perag_put(pag);
- }
-
- /*
- * if we skipped any AG, and we still have scan count remaining, do
- * another pass this time using blocking reclaim semantics (i.e
- * waiting on the reclaim locks and ignoring the reclaim cursors). This
- * ensure that when we get more reclaimers than AGs we block rather
- * than spin trying to execute reclaim.
- */
- if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
- trylock = 0;
- goto restart;
- }
- return last_error;
+ return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
+ xfs_is_shutdown(mp);
}
-int
+void
xfs_reclaim_inodes(
- xfs_mount_t *mp,
- int mode)
+ struct xfs_mount *mp)
{
- int nr_to_scan = INT_MAX;
+ struct xfs_icwalk icw = {
+ .icw_flags = 0,
+ };
- return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+ if (xfs_want_reclaim_sick(mp))
+ icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
+ while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
+ }
}
/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
+ * The shrinker infrastructure determines how many inodes we should scan for
+ * reclaim. We want as many clean inodes ready to reclaim as possible, so we
+ * push the AIL here. We also want to proactively free up memory if we can to
+ * minimise the amount of work memory reclaim has to do so we kick the
+ * background reclaim if it isn't already scheduled.
*/
long
xfs_reclaim_inodes_nr(
struct xfs_mount *mp,
- int nr_to_scan)
+ unsigned long nr_to_scan)
{
+ struct xfs_icwalk icw = {
+ .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
+ .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
+ };
+
+ if (xfs_want_reclaim_sick(mp))
+ icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
+
/* kick background reclaimer and push the AIL */
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
+ return 0;
}
/*
* Return the number of reclaimable inodes in the filesystem for
* the shrinker to determine how much to reclaim.
*/
-int
+long
xfs_reclaim_inodes_count(
struct xfs_mount *mp)
{
struct xfs_perag *pag;
xfs_agnumber_t ag = 0;
- int reclaimable = 0;
+ long reclaimable = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
ag = pag->pag_agno + 1;
@@ -1405,239 +1029,165 @@ xfs_reclaim_inodes_count(
return reclaimable;
}
-STATIC int
-xfs_inode_match_id(
+STATIC bool
+xfs_icwalk_match_id(
struct xfs_inode *ip,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
- if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
- !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
- return 0;
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
+ !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
+ return false;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
- !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
- return 0;
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
+ !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
+ return false;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- ip->i_d.di_projid != eofb->eof_prid)
- return 0;
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
+ ip->i_projid != icw->icw_prid)
+ return false;
- return 1;
+ return true;
}
/*
* A union-based inode filtering algorithm. Process the inode if any of the
* criteria match. This is for global/internal scans only.
*/
-STATIC int
-xfs_inode_match_id_union(
+STATIC bool
+xfs_icwalk_match_id_union(
struct xfs_inode *ip,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
- if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
- uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
- return 1;
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
+ uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
+ return true;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
- gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
- return 1;
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
+ gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
+ return true;
- if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- ip->i_d.di_projid == eofb->eof_prid)
- return 1;
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
+ ip->i_projid == icw->icw_prid)
+ return true;
- return 0;
+ return false;
+}
+
+/*
+ * Is this inode @ip eligible for eof/cow block reclamation, given some
+ * filtering parameters @icw? The inode is eligible if @icw is null or
+ * if the predicate functions match.
+ */
+static bool
+xfs_icwalk_match(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ bool match;
+
+ if (!icw)
+ return true;
+
+ if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
+ match = xfs_icwalk_match_id_union(ip, icw);
+ else
+ match = xfs_icwalk_match_id(ip, icw);
+ if (!match)
+ return false;
+
+ /* skip the inode if the file size is too small */
+ if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
+ XFS_ISIZE(ip) < icw->icw_min_file_size)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
+ xfs_reclaim_work_queue(mp);
}
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- int flags,
- void *args)
+ struct xfs_icwalk *icw,
+ unsigned int *lockflags)
{
- int ret = 0;
- struct xfs_eofblocks *eofb = args;
- int match;
+ bool wait;
- if (!xfs_can_free_eofblocks(ip, false)) {
- /* inode could be preallocated or append-only */
- trace_xfs_inode_free_eofblocks_invalid(ip);
- xfs_inode_clear_eofblocks_tag(ip);
+ wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
+
+ if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
return 0;
- }
/*
* If the mapping is dirty the operation can block and wait for some
* time. Unless we are waiting, skip it.
*/
- if (!(flags & SYNC_WAIT) &&
- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
return 0;
- if (eofb) {
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
- else
- match = xfs_inode_match_id(ip, eofb);
- if (!match)
- return 0;
-
- /* skip the inode if the file size is too small */
- if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
- return 0;
- }
+ if (!xfs_icwalk_match(ip, icw))
+ return 0;
/*
* If the caller is waiting, return -EAGAIN to keep the background
* scanner moving and revisit the inode in a subsequent pass.
*/
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- if (flags & SYNC_WAIT)
- ret = -EAGAIN;
- return ret;
+ if (wait)
+ return -EAGAIN;
+ return 0;
}
- ret = xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
- return ret;
-}
-
-static int
-__xfs_icache_free_eofblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int tag)
-{
- int flags = SYNC_TRYLOCK;
+ *lockflags |= XFS_IOLOCK_EXCL;
- if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
- flags = SYNC_WAIT;
+ if (xfs_can_free_eofblocks(ip, false))
+ return xfs_free_eofblocks(ip);
- return xfs_inode_ag_iterator_tag(mp, execute, flags,
- eofb, tag);
-}
-
-int
-xfs_icache_free_eofblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
-{
- return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
- XFS_ICI_EOFBLOCKS_TAG);
+ /* inode could be preallocated or append-only */
+ trace_xfs_inode_free_eofblocks_invalid(ip);
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
}
-/*
- * Run eofblocks scans on the quotas applicable to the inode. For inodes with
- * multiple quotas, we don't know exactly which quota caused an allocation
- * failure. We make a best effort by including each quota under low free space
- * conditions (less than 1% free space) in the scan.
- */
-static int
-__xfs_inode_free_quota_eofblocks(
+static void
+xfs_blockgc_set_iflag(
struct xfs_inode *ip,
- int (*execute)(struct xfs_mount *mp,
- struct xfs_eofblocks *eofb))
+ unsigned long iflag)
{
- int scan = 0;
- struct xfs_eofblocks eofb = {0};
- struct xfs_dquot *dq;
-
- /*
- * Run a sync scan to increase effectiveness and use the union filter to
- * cover all applicable quotas in a single scan.
- */
- eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
-
- if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQ_USER);
- if (dq && xfs_dquot_lowsp(dq)) {
- eofb.eof_uid = VFS_I(ip)->i_uid;
- eofb.eof_flags |= XFS_EOF_FLAGS_UID;
- scan = 1;
- }
- }
-
- if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
- if (dq && xfs_dquot_lowsp(dq)) {
- eofb.eof_gid = VFS_I(ip)->i_gid;
- eofb.eof_flags |= XFS_EOF_FLAGS_GID;
- scan = 1;
- }
- }
-
- if (scan)
- execute(ip->i_mount, &eofb);
-
- return scan;
-}
-
-int
-xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip)
-{
- return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
-}
-
-static inline unsigned long
-xfs_iflag_for_tag(
- int tag)
-{
- switch (tag) {
- case XFS_ICI_EOFBLOCKS_TAG:
- return XFS_IEOFBLOCKS;
- case XFS_ICI_COWBLOCKS_TAG:
- return XFS_ICOWBLOCKS;
- default:
- ASSERT(0);
- return 0;
- }
-}
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
-static void
-__xfs_inode_set_blocks_tag(
- xfs_inode_t *ip,
- void (*execute)(struct xfs_mount *mp),
- void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
- int error, unsigned long caller_ip),
- int tag)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- int tagged;
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
/*
* Don't bother locking the AG and looking up in the radix trees
* if we already know that we have the tag set.
*/
- if (ip->i_flags & xfs_iflag_for_tag(tag))
+ if (ip->i_flags & iflag)
return;
spin_lock(&ip->i_flags_lock);
- ip->i_flags |= xfs_iflag_for_tag(tag);
+ ip->i_flags |= iflag;
spin_unlock(&ip->i_flags_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
- if (!tagged) {
- /* propagate the eofblocks tag up into the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_set(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- tag);
- spin_unlock(&ip->i_mount->m_perag_lock);
-
- /* kick off background trimming */
- execute(ip->i_mount);
-
- set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
- }
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
@@ -1648,39 +1198,33 @@ xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_eofblocks_tag(ip);
- return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
- trace_xfs_perag_set_eofblocks,
- XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}
static void
-__xfs_inode_clear_blocks_tag(
- xfs_inode_t *ip,
- void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
- int error, unsigned long caller_ip),
- int tag)
+xfs_blockgc_clear_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ bool clear_tag;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
spin_lock(&ip->i_flags_lock);
- ip->i_flags &= ~xfs_iflag_for_tag(tag);
+ ip->i_flags &= ~iflag;
+ clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
spin_unlock(&ip->i_flags_lock);
+ if (!clear_tag)
+ return;
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
- if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
- /* clear the eofblocks tag from the perag radix tree */
- spin_lock(&ip->i_mount->m_perag_lock);
- radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
- XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- tag);
- spin_unlock(&ip->i_mount->m_perag_lock);
- clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
- }
+ xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
@@ -1691,8 +1235,7 @@ xfs_inode_clear_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_eofblocks_tag(ip);
- return __xfs_inode_clear_blocks_tag(ip,
- trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}
/*
@@ -1742,33 +1285,41 @@ xfs_prep_free_cowblocks(
STATIC int
xfs_inode_free_cowblocks(
struct xfs_inode *ip,
- int flags,
- void *args)
+ struct xfs_icwalk *icw,
+ unsigned int *lockflags)
{
- struct xfs_eofblocks *eofb = args;
- int match;
+ bool wait;
int ret = 0;
+ wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
+
+ if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
+ return 0;
+
if (!xfs_prep_free_cowblocks(ip))
return 0;
- if (eofb) {
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
- else
- match = xfs_inode_match_id(ip, eofb);
- if (!match)
- return 0;
-
- /* skip the inode if the file size is too small */
- if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
- return 0;
+ if (!xfs_icwalk_match(ip, icw))
+ return 0;
+
+ /*
+ * If the caller is waiting, return -EAGAIN to keep the background
+ * scanner moving and revisit the inode in a subsequent pass.
+ */
+ if (!(*lockflags & XFS_IOLOCK_EXCL) &&
+ !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
}
+ *lockflags |= XFS_IOLOCK_EXCL;
- /* Free the CoW blocks */
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_MMAPLOCK_EXCL;
/*
* Check again, nobody else should be able to dirty blocks or change
@@ -1776,62 +1327,899 @@ xfs_inode_free_cowblocks(
*/
if (xfs_prep_free_cowblocks(ip))
ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
+ return ret;
+}
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+void
+xfs_inode_set_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_set_cowblocks_tag(ip);
+ return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
+}
- return ret;
+void
+xfs_inode_clear_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_cowblocks_tag(ip);
+ return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}
+/* Disable post-EOF and CoW block auto-reclamation. */
+void
+xfs_blockgc_stop(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ if (!xfs_clear_blockgc_enabled(mp))
+ return;
+
+ for_each_perag(mp, agno, pag)
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ trace_xfs_blockgc_stop(mp, __return_address);
+}
+
+/* Enable post-EOF and CoW block auto-reclamation. */
+void
+xfs_blockgc_start(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ if (xfs_set_blockgc_enabled(mp))
+ return;
+
+ trace_xfs_blockgc_start(mp, __return_address);
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ xfs_blockgc_queue(pag);
+}
+
+/* Don't try to run block gc on an inode that's in any of these states. */
+#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \
+ XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING | \
+ XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM)
+/*
+ * Decide if the given @ip is eligible for garbage collection of speculative
+ * preallocations, and grab it if so. Returns true if it's ready to go or
+ * false if we should just ignore it.
+ */
+static bool
+xfs_blockgc_igrab(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(rcu_read_lock_held());
+
+ /* Check for stale RCU freed inode */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino)
+ goto out_unlock_noent;
+
+ if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
+ goto out_unlock_noent;
+ spin_unlock(&ip->i_flags_lock);
+
+ /* nothing to sync during shutdown */
+ if (xfs_is_shutdown(ip->i_mount))
+ return false;
+
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ return false;
+
+ /* inode is valid */
+ return true;
+
+out_unlock_noent:
+ spin_unlock(&ip->i_flags_lock);
+ return false;
+}
+
+/* Scan one incore inode for block preallocations that we can remove. */
+static int
+xfs_blockgc_scan_inode(
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ unsigned int lockflags = 0;
+ int error;
+
+ error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
+ if (error)
+ goto unlock;
+
+ error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
+unlock:
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ xfs_irele(ip);
+ return error;
+}
+
+/* Background worker that trims preallocated space. */
+void
+xfs_blockgc_worker(
+ struct work_struct *work)
+{
+ struct xfs_perag *pag = container_of(to_delayed_work(work),
+ struct xfs_perag, pag_blockgc_work);
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ trace_xfs_blockgc_worker(mp, __return_address);
+
+ error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
+ if (error)
+ xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
+ pag->pag_agno, error);
+ xfs_blockgc_queue(pag);
+}
+
+/*
+ * Try to free space in the filesystem by purging inactive inodes, eofblocks
+ * and cowblocks.
+ */
int
-xfs_icache_free_cowblocks(
+xfs_blockgc_free_space(
struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
+ struct xfs_icwalk *icw)
{
- return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
- XFS_ICI_COWBLOCKS_TAG);
+ int error;
+
+ trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
+
+ error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
+ if (error)
+ return error;
+
+ xfs_inodegc_flush(mp);
+ return 0;
}
+/*
+ * Reclaim all the free space that we can by scheduling the background blockgc
+ * and inodegc workers immediately and waiting for them all to clear.
+ */
+void
+xfs_blockgc_flush_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ trace_xfs_blockgc_flush_all(mp, __return_address);
+
+ /*
+ * For each blockgc worker, move its queue time up to now. If it
+ * wasn't queued, it will not be requeued. Then flush whatever's
+ * left.
+ */
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ mod_delayed_work(pag->pag_mount->m_blockgc_wq,
+ &pag->pag_blockgc_work, 0);
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ flush_delayed_work(&pag->pag_blockgc_work);
+
+ xfs_inodegc_flush(mp);
+}
+
+/*
+ * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
+ * quota caused an allocation failure, so we make a best effort by including
+ * each quota under low free space conditions (less than 1% free space) in the
+ * scan.
+ *
+ * Callers must not hold any inode's ILOCK. If requesting a synchronous scan
+ * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
+ * MMAPLOCK.
+ */
int
-xfs_inode_free_quota_cowblocks(
- struct xfs_inode *ip)
+xfs_blockgc_free_dquots(
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int iwalk_flags)
+{
+ struct xfs_icwalk icw = {0};
+ bool do_work = false;
+
+ if (!udqp && !gdqp && !pdqp)
+ return 0;
+
+ /*
+ * Run a scan to free blocks using the union filter to cover all
+ * applicable quotas in a single scan.
+ */
+ icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
+
+ if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
+ icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
+ icw.icw_flags |= XFS_ICWALK_FLAG_UID;
+ do_work = true;
+ }
+
+ if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
+ icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
+ icw.icw_flags |= XFS_ICWALK_FLAG_GID;
+ do_work = true;
+ }
+
+ if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
+ icw.icw_prid = pdqp->q_id;
+ icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
+ do_work = true;
+ }
+
+ if (!do_work)
+ return 0;
+
+ return xfs_blockgc_free_space(mp, &icw);
+}
+
+/* Run cow/eofblocks scans on the quotas attached to the inode. */
+int
+xfs_blockgc_free_quota(
+ struct xfs_inode *ip,
+ unsigned int iwalk_flags)
+{
+ return xfs_blockgc_free_dquots(ip->i_mount,
+ xfs_inode_dquot(ip, XFS_DQTYPE_USER),
+ xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
+ xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
+}
+
+/* XFS Inode Cache Walking Code */
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH 32
+
+
+/*
+ * Decide if we want to grab this inode in anticipation of doing work towards
+ * the goal.
+ */
+static inline bool
+xfs_icwalk_igrab(
+ enum xfs_icwalk_goal goal,
+ struct xfs_inode *ip,
+ struct xfs_icwalk *icw)
+{
+ switch (goal) {
+ case XFS_ICWALK_BLOCKGC:
+ return xfs_blockgc_igrab(ip);
+ case XFS_ICWALK_RECLAIM:
+ return xfs_reclaim_igrab(ip, icw);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Process an inode. Each processing function must handle any state changes
+ * made by the icwalk igrab function. Return -EAGAIN to skip an inode.
+ */
+static inline int
+xfs_icwalk_process_inode(
+ enum xfs_icwalk_goal goal,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ struct xfs_icwalk *icw)
+{
+ int error = 0;
+
+ switch (goal) {
+ case XFS_ICWALK_BLOCKGC:
+ error = xfs_blockgc_scan_inode(ip, icw);
+ break;
+ case XFS_ICWALK_RECLAIM:
+ xfs_reclaim_inode(ip, pag);
+ break;
+ }
+ return error;
+}
+
+/*
+ * For a given per-AG structure @pag and a goal, grab qualifying inodes and
+ * process them in some manner.
+ */
+static int
+xfs_icwalk_ag(
+ struct xfs_perag *pag,
+ enum xfs_icwalk_goal goal,
+ struct xfs_icwalk *icw)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ uint32_t first_index;
+ int last_error = 0;
+ int skipped;
+ bool done;
+ int nr_found;
+
+restart:
+ done = false;
+ skipped = 0;
+ if (goal == XFS_ICWALK_RECLAIM)
+ first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
+ else
+ first_index = 0;
+ nr_found = 0;
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ rcu_read_lock();
+
+ nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, goal);
+ if (!nr_found) {
+ done = true;
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || !xfs_icwalk_igrab(goal, ip, icw))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = true;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = xfs_icwalk_process_inode(goal, batch[i], pag,
+ icw);
+ if (error == -EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != -EFSCORRUPTED)
+ last_error = error;
+ }
+
+ /* bail out if the filesystem is corrupted. */
+ if (error == -EFSCORRUPTED)
+ break;
+
+ cond_resched();
+
+ if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
+ icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
+ if (icw->icw_scan_limit <= 0)
+ break;
+ }
+ } while (nr_found && !done);
+
+ if (goal == XFS_ICWALK_RECLAIM) {
+ if (done)
+ first_index = 0;
+ WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
+ }
+
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+ return last_error;
+}
+
+/* Walk all incore inodes to achieve a given goal. */
+static int
+xfs_icwalk(
+ struct xfs_mount *mp,
+ enum xfs_icwalk_goal goal,
+ struct xfs_icwalk *icw)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t agno;
+
+ for_each_perag_tag(mp, agno, pag, goal) {
+ error = xfs_icwalk_ag(pag, goal, icw);
+ if (error) {
+ last_error = error;
+ if (error == -EFSCORRUPTED) {
+ xfs_perag_put(pag);
+ break;
+ }
+ }
+ }
+ return last_error;
+ BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
+}
+
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+ struct xfs_inode *ip,
+ int whichfork)
{
- return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+
+ if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+ return;
+ do {
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_warn(ip->i_mount,
+ "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+ ip->i_ino,
+ whichfork == XFS_DATA_FORK ? "data" : "cow",
+ got.br_startoff, got.br_blockcount);
+ }
+ } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork) do { } while (0)
+#endif
+
+/* Schedule the inode for reclaim. */
+static void
+xfs_inodegc_set_reclaimable(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
+ xfs_check_delalloc(ip, XFS_DATA_FORK);
+ xfs_check_delalloc(ip, XFS_COW_FORK);
+ ASSERT(0);
+ }
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ trace_xfs_inode_set_reclaimable(ip);
+ ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Free all speculative preallocations and possibly even the inode itself.
+ * This is the last chance to make changes to an otherwise unreferenced file
+ * before incore reclamation happens.
+ */
+static void
+xfs_inodegc_inactivate(
+ struct xfs_inode *ip)
+{
+ trace_xfs_inode_inactivating(ip);
+ xfs_inactive(ip);
+ xfs_inodegc_set_reclaimable(ip);
}
void
-xfs_inode_set_cowblocks_tag(
- xfs_inode_t *ip)
+xfs_inodegc_worker(
+ struct work_struct *work)
{
- trace_xfs_inode_set_cowblocks_tag(ip);
- return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
- trace_xfs_perag_set_cowblocks,
- XFS_ICI_COWBLOCKS_TAG);
+ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
+ struct xfs_inodegc, work);
+ struct llist_node *node = llist_del_all(&gc->list);
+ struct xfs_inode *ip, *n;
+
+ WRITE_ONCE(gc->items, 0);
+
+ if (!node)
+ return;
+
+ ip = llist_entry(node, struct xfs_inode, i_gclist);
+ trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+
+ WRITE_ONCE(gc->shrinker_hits, 0);
+ llist_for_each_entry_safe(ip, n, node, i_gclist) {
+ xfs_iflags_set(ip, XFS_INACTIVATING);
+ xfs_inodegc_inactivate(ip);
+ }
}
+/*
+ * Expedite all pending inodegc work to run immediately. This does not wait for
+ * completion of the work.
+ */
void
-xfs_inode_clear_cowblocks_tag(
- xfs_inode_t *ip)
+xfs_inodegc_push(
+ struct xfs_mount *mp)
{
- trace_xfs_inode_clear_cowblocks_tag(ip);
- return __xfs_inode_clear_blocks_tag(ip,
- trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
+ if (!xfs_is_inodegc_enabled(mp))
+ return;
+ trace_xfs_inodegc_push(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
}
-/* Disable post-EOF and CoW block auto-reclamation. */
+/*
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
void
-xfs_stop_block_reaping(
+xfs_inodegc_flush(
struct xfs_mount *mp)
{
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
+ xfs_inodegc_push(mp);
+ trace_xfs_inodegc_flush(mp, __return_address);
+ flush_workqueue(mp->m_inodegc_wq);
}
-/* Enable post-EOF and CoW block auto-reclamation. */
+/*
+ * Flush all the pending work and then disable the inode inactivation background
+ * workers and wait for them to stop.
+ */
void
-xfs_start_block_reaping(
+xfs_inodegc_stop(
struct xfs_mount *mp)
{
- xfs_queue_eofblocks(mp);
- xfs_queue_cowblocks(mp);
+ if (!xfs_clear_inodegc_enabled(mp))
+ return;
+
+ xfs_inodegc_queue_all(mp);
+ drain_workqueue(mp->m_inodegc_wq);
+
+ trace_xfs_inodegc_stop(mp, __return_address);
+}
+
+/*
+ * Enable the inode inactivation background workers and schedule deferred inode
+ * inactivation work if there is any.
+ */
+void
+xfs_inodegc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_set_inodegc_enabled(mp))
+ return;
+
+ trace_xfs_inodegc_start(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+
+#ifdef CONFIG_XFS_RT
+static inline bool
+xfs_inodegc_want_queue_rt_file(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!XFS_IS_REALTIME_INODE(ip))
+ return false;
+
+ if (__percpu_counter_compare(&mp->m_frextents,
+ mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ return false;
+}
+#else
+# define xfs_inodegc_want_queue_rt_file(ip) (false)
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Schedule the inactivation worker when:
+ *
+ * - We've accumulated more than one inode cluster buffer's worth of inodes.
+ * - There is less than 5% free space left.
+ * - Any of the quotas for this inode are near an enforcement limit.
+ */
+static inline bool
+xfs_inodegc_want_queue_work(
+ struct xfs_inode *ip,
+ unsigned int items)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (items > mp->m_ino_geo.inodes_per_cluster)
+ return true;
+
+ if (__percpu_counter_compare(&mp->m_fdblocks,
+ mp->m_low_space[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ if (xfs_inodegc_want_queue_rt_file(ip))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
+ return true;
+
+ if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
+ return true;
+
+ return false;
+}
+
+/*
+ * Upper bound on the number of inodes in each AG that can be queued for
+ * inactivation at any given time, to avoid monopolizing the workqueue.
+ */
+#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK)
+
+/*
+ * Make the frontend wait for inactivations when:
+ *
+ * - Memory shrinkers queued the inactivation worker and it hasn't finished.
+ * - The queue depth exceeds the maximum allowable percpu backlog.
+ *
+ * Note: If the current thread is running a transaction, we don't ever want to
+ * wait for other transactions because that could introduce a deadlock.
+ */
+static inline bool
+xfs_inodegc_want_flush_work(
+ struct xfs_inode *ip,
+ unsigned int items,
+ unsigned int shrinker_hits)
+{
+ if (current->journal_info)
+ return false;
+
+ if (shrinker_hits > 0)
+ return true;
+
+ if (items > XFS_INODEGC_MAX_BACKLOG)
+ return true;
+
+ return false;
+}
+
+/*
+ * Queue a background inactivation worker if there are inodes that need to be
+ * inactivated and higher level xfs code hasn't disabled the background
+ * workers.
+ */
+static void
+xfs_inodegc_queue(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inodegc *gc;
+ int items;
+ unsigned int shrinker_hits;
+ unsigned long queue_delay = 1;
+
+ trace_xfs_inode_set_need_inactive(ip);
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags |= XFS_NEED_INACTIVE;
+ spin_unlock(&ip->i_flags_lock);
+
+ gc = get_cpu_ptr(mp->m_inodegc);
+ llist_add(&ip->i_gclist, &gc->list);
+ items = READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, items + 1);
+ shrinker_hits = READ_ONCE(gc->shrinker_hits);
+
+ /*
+ * We queue the work while holding the current CPU so that the work
+ * is scheduled to run on this CPU.
+ */
+ if (!xfs_is_inodegc_enabled(mp)) {
+ put_cpu_ptr(gc);
+ return;
+ }
+
+ if (xfs_inodegc_want_queue_work(ip, items))
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+ put_cpu_ptr(gc);
+
+ if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
+ trace_xfs_inodegc_throttle(mp, __return_address);
+ flush_delayed_work(&gc->work);
+ }
+}
+
+/*
+ * Fold the dead CPU inodegc queue into the current CPUs queue.
+ */
+void
+xfs_inodegc_cpu_dead(
+ struct xfs_mount *mp,
+ unsigned int dead_cpu)
+{
+ struct xfs_inodegc *dead_gc, *gc;
+ struct llist_node *first, *last;
+ unsigned int count = 0;
+
+ dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
+ cancel_delayed_work_sync(&dead_gc->work);
+
+ if (llist_empty(&dead_gc->list))
+ return;
+
+ first = dead_gc->list.first;
+ last = first;
+ while (last->next) {
+ last = last->next;
+ count++;
+ }
+ dead_gc->list.first = NULL;
+ dead_gc->items = 0;
+
+ /* Add pending work to current CPU */
+ gc = get_cpu_ptr(mp->m_inodegc);
+ llist_add_batch(first, last, &gc->list);
+ count += READ_ONCE(gc->items);
+ WRITE_ONCE(gc->items, count);
+
+ if (xfs_is_inodegc_enabled(mp)) {
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
+ }
+ put_cpu_ptr(gc);
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag. Once we get tag
+ * lookups on the radix tree, this inode flag can go away.
+ *
+ * We always use background reclaim here because even if the inode is clean, it
+ * still may be under IO and hence we have wait for IO completion to occur
+ * before we can reclaim the inode. The background reclaim path handles this
+ * more efficiently than we can here, so simply let background reclaim tear down
+ * all inodes.
+ */
+void
+xfs_inode_mark_reclaimable(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ bool need_inactive;
+
+ XFS_STATS_INC(mp, vn_reclaim);
+
+ /*
+ * We should never get here with any of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
+
+ need_inactive = xfs_inode_needs_inactive(ip);
+ if (need_inactive) {
+ xfs_inodegc_queue(ip);
+ return;
+ }
+
+ /* Going straight to reclaim, so drop the dquots. */
+ xfs_qm_dqdetach(ip);
+ xfs_inodegc_set_reclaimable(ip);
+}
+
+/*
+ * Register a phony shrinker so that we can run background inodegc sooner when
+ * there's memory pressure. Inactivation does not itself free any memory but
+ * it does make inodes reclaimable, which eventually frees memory.
+ *
+ * The count function, seek value, and batch value are crafted to trigger the
+ * scan function during the second round of scanning. Hopefully this means
+ * that we reclaimed enough memory that initiating metadata transactions won't
+ * make things worse.
+ */
+#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY)
+#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
+
+static unsigned long
+xfs_inodegc_shrinker_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
+ m_inodegc_shrinker);
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list))
+ return XFS_INODEGC_SHRINKER_COUNT;
+ }
+
+ return 0;
+}
+
+static unsigned long
+xfs_inodegc_shrinker_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
+ m_inodegc_shrinker);
+ struct xfs_inodegc *gc;
+ int cpu;
+ bool no_items = true;
+
+ if (!xfs_is_inodegc_enabled(mp))
+ return SHRINK_STOP;
+
+ trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
+
+ for_each_online_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ if (!llist_empty(&gc->list)) {
+ unsigned int h = READ_ONCE(gc->shrinker_hits);
+
+ WRITE_ONCE(gc->shrinker_hits, h + 1);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
+ no_items = false;
+ }
+ }
+
+ /*
+ * If there are no inodes to inactivate, we don't want the shrinker
+ * to think there's deferred work to call us back about.
+ */
+ if (no_items)
+ return LONG_MAX;
+
+ return SHRINK_STOP;
+}
+
+/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
+int
+xfs_inodegc_register_shrinker(
+ struct xfs_mount *mp)
+{
+ struct shrinker *shrink = &mp->m_inodegc_shrinker;
+
+ shrink->count_objects = xfs_inodegc_shrinker_count;
+ shrink->scan_objects = xfs_inodegc_shrinker_scan;
+ shrink->seeks = 0;
+ shrink->flags = SHRINKER_NONSLAB;
+ shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+
+ return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 48f1fd2bb6ad..6cd180721659 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -9,25 +9,27 @@
struct xfs_mount;
struct xfs_perag;
-struct xfs_eofblocks {
- __u32 eof_flags;
- kuid_t eof_uid;
- kgid_t eof_gid;
- prid_t eof_prid;
- __u64 eof_min_file_size;
+struct xfs_icwalk {
+ __u32 icw_flags;
+ kuid_t icw_uid;
+ kgid_t icw_gid;
+ prid_t icw_prid;
+ __u64 icw_min_file_size;
+ long icw_scan_limit;
};
-#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
-#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
+/* Flags that reflect xfs_fs_eofblocks functionality. */
+#define XFS_ICWALK_FLAG_SYNC (1U << 0) /* sync/wait mode scan */
+#define XFS_ICWALK_FLAG_UID (1U << 1) /* filter by uid */
+#define XFS_ICWALK_FLAG_GID (1U << 2) /* filter by gid */
+#define XFS_ICWALK_FLAG_PRID (1U << 3) /* filter by project id */
+#define XFS_ICWALK_FLAG_MINFILESIZE (1U << 4) /* filter by min file size */
-/*
- * tags for inode radix tree
- */
-#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
- in xfs_inode_ag_iterator */
-#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
-#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
-#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */
+#define XFS_ICWALK_FLAGS_VALID (XFS_ICWALK_FLAG_SYNC | \
+ XFS_ICWALK_FLAG_UID | \
+ XFS_ICWALK_FLAG_GID | \
+ XFS_ICWALK_FLAG_PRID | \
+ XFS_ICWALK_FLAG_MINFILESIZE)
/*
* Flags for xfs_iget()
@@ -37,11 +39,6 @@ struct xfs_eofblocks {
#define XFS_IGET_DONTCACHE 0x4
#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
-/*
- * flags for AG inode iterator
- */
-#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */
-
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
@@ -51,75 +48,39 @@ void xfs_inode_free(struct xfs_inode *ip);
void xfs_reclaim_worker(struct work_struct *work);
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+void xfs_reclaim_inodes(struct xfs_mount *mp);
+long xfs_reclaim_inodes_count(struct xfs_mount *mp);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, unsigned long nr_to_scan);
+
+void xfs_inode_mark_reclaimable(struct xfs_inode *ip);
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ unsigned int iwalk_flags);
+int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
+int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
+void xfs_blockgc_flush_all(struct xfs_mount *mp);
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
-int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
-void xfs_eofblocks_worker(struct work_struct *);
-void xfs_queue_eofblocks(struct xfs_mount *);
void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
-int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
-void xfs_cowblocks_worker(struct work_struct *);
-void xfs_queue_cowblocks(struct xfs_mount *);
-
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags, void *args),
- int flags, void *args);
-int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags, void *args),
- int flags, void *args, int iter_flags);
-int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags, void *args),
- int flags, void *args, int tag);
-
-static inline int
-xfs_fs_eofblocks_from_user(
- struct xfs_fs_eofblocks *src,
- struct xfs_eofblocks *dst)
-{
- if (src->eof_version != XFS_EOFBLOCKS_VERSION)
- return -EINVAL;
-
- if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
- return -EINVAL;
-
- if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
- memchr_inv(src->pad64, 0, sizeof(src->pad64)))
- return -EINVAL;
-
- dst->eof_flags = src->eof_flags;
- dst->eof_prid = src->eof_prid;
- dst->eof_min_file_size = src->eof_min_file_size;
-
- dst->eof_uid = INVALID_UID;
- if (src->eof_flags & XFS_EOF_FLAGS_UID) {
- dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
- if (!uid_valid(dst->eof_uid))
- return -EINVAL;
- }
-
- dst->eof_gid = INVALID_GID;
- if (src->eof_flags & XFS_EOF_FLAGS_GID) {
- dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
- if (!gid_valid(dst->eof_gid))
- return -EINVAL;
- }
- return 0;
-}
+
+void xfs_blockgc_worker(struct work_struct *work);
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
-void xfs_stop_block_reaping(struct xfs_mount *mp);
-void xfs_start_block_reaping(struct xfs_mount *mp);
+void xfs_blockgc_stop(struct xfs_mount *mp);
+void xfs_blockgc_start(struct xfs_mount *mp);
+
+void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_push(struct xfs_mount *mp);
+void xfs_inodegc_flush(struct xfs_mount *mp);
+void xfs_inodegc_stop(struct xfs_mount *mp);
+void xfs_inodegc_start(struct xfs_mount *mp);
+void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
+int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 490fee22b878..b05314d48176 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -6,13 +6,21 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_icreate_item.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_ialloc.h"
+#include "xfs_trace.h"
-kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+struct kmem_cache *xfs_icreate_cache; /* inode create item */
static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
{
@@ -55,7 +63,8 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
+ kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
static const struct xfs_item_ops xfs_icreate_item_ops = {
@@ -89,7 +98,7 @@ xfs_icreate_log(
{
struct xfs_icreate_item *icp;
- icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
+ icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
@@ -107,3 +116,147 @@ xfs_icreate_log(
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
}
+
+static enum xlog_recover_reorder
+xlog_recover_icreate_reorder(
+ struct xlog_recover_item *item)
+{
+ /*
+ * Inode allocation buffers must be replayed before subsequent inode
+ * items try to modify those buffers. ICREATE items are the logical
+ * equivalent of logging a newly initialized inode buffer, so recover
+ * these at the same time that we recover logged buffers.
+ */
+ return XLOG_REORDER_BUFFER_LIST;
+}
+
+/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_icreate_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+ int bb_per_cluster;
+ int cancel_count;
+ int nbufs;
+ int i;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return -EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return -EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return -EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return -EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return -EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return -EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return -EINVAL;
+ }
+
+ /*
+ * The inode chunk is either full or sparse and we only support
+ * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
+ */
+ if (length != igeo->ialloc_blks &&
+ length != igeo->ialloc_min_blks) {
+ xfs_warn(log->l_mp,
+ "%s: unsupported chunk length", __func__);
+ return -EINVAL;
+ }
+
+ /* verify inode count is consistent with extent length */
+ if ((count >> mp->m_sb.sb_inopblog) != length) {
+ xfs_warn(log->l_mp,
+ "%s: inconsistent inode count and chunk length",
+ __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * The icreate transaction can cover multiple cluster buffers and these
+ * buffers could have been freed and reused. Check the individual
+ * buffers for cancellation so we don't overwrite anything written after
+ * a cancellation.
+ */
+ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ nbufs = length / igeo->blocks_per_cluster;
+ for (i = 0, cancel_count = 0; i < nbufs; i++) {
+ xfs_daddr_t daddr;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno,
+ agbno + i * igeo->blocks_per_cluster);
+ if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster))
+ cancel_count++;
+ }
+
+ /*
+ * We currently only use icreate for a single allocation at a time. This
+ * means we should expect either all or none of the buffers to be
+ * cancelled. Be conservative and skip replay if at least one buffer is
+ * cancelled, but warn the user that something is awry if the buffers
+ * are not consistent.
+ *
+ * XXX: This must be refined to only skip cancelled clusters once we use
+ * icreate for multiple chunk allocations.
+ */
+ ASSERT(!cancel_count || cancel_count == nbufs);
+ if (cancel_count) {
+ if (cancel_count != nbufs)
+ xfs_warn(mp,
+ "WARNING: partial inode chunk cancellation, skipped icreate.");
+ trace_xfs_log_recover_icreate_cancel(log, icl);
+ return 0;
+ }
+
+ trace_xfs_log_recover_icreate_recover(log, icl);
+ return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+ length, be32_to_cpu(icl->icl_gen));
+}
+
+const struct xlog_recover_item_ops xlog_icreate_item_ops = {
+ .item_type = XFS_LI_ICREATE,
+ .reorder = xlog_recover_icreate_reorder,
+ .commit_pass2 = xlog_recover_icreate_commit_pass2,
+};
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
index a50d0b01e15a..64992823108a 100644
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -12,7 +12,7 @@ struct xfs_icreate_item {
struct xfs_icreate_log ic_format;
};
-extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+extern struct kmem_cache *xfs_icreate_cache; /* inode create item */
void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t agbno, unsigned int count,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c5077e6326c7..aa303be11576 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -11,7 +11,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
@@ -21,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
+#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -35,8 +35,10 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
+#include "xfs_ag.h"
+#include "xfs_log_priv.h"
-kmem_zone_t *xfs_inode_zone;
+struct kmem_cache *xfs_inode_cache;
/*
* Used in xfs_itruncate_extents(). This is the maximum number of extents
@@ -44,9 +46,9 @@ kmem_zone_t *xfs_inode_zone;
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
-STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
+STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *);
/*
* helper function to extract extent size hint from inode
@@ -61,8 +63,8 @@ xfs_get_extsz_hint(
*/
if (xfs_is_always_cow_inode(ip))
return 0;
- if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
- return ip->i_d.di_extsize;
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+ return ip->i_extsize;
if (XFS_IS_REALTIME_INODE(ip))
return ip->i_mount->m_sb.sb_rextsize;
return 0;
@@ -81,8 +83,8 @@ xfs_get_cowextsz_hint(
xfs_extlen_t a, b;
a = 0;
- if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
- a = ip->i_d.di_cowextsize;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ a = ip->i_cowextsize;
b = xfs_get_extsz_hint(ip);
a = max(a, b);
@@ -112,8 +114,7 @@ xfs_ilock_data_map_shared(
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
+ if (xfs_need_iread_extents(&ip->i_df))
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
return lock_mode;
@@ -125,16 +126,35 @@ xfs_ilock_attr_map_shared(
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
- (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
return lock_mode;
}
/*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
+ * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
+ * to set in lock_flags.
+ */
+static inline void
+xfs_lock_flags_assert(
+ uint lock_flags)
+{
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+}
+
+/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
- * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
+ * multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
*
* The 3 locks should always be ordered so that the IO lock is obtained first,
@@ -142,23 +162,23 @@ xfs_ilock_attr_map_shared(
*
* Basic locking order:
*
- * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
*
- * mmap_sem locking order:
+ * mmap_lock locking order:
*
- * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_mmap_lock -> page_lock
+ * i_rwsem -> page lock -> mmap_lock
+ * mmap_lock -> invalidate_lock -> page_lock
*
- * The difference in mmap_sem locking order mean that we cannot hold the
- * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
- * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_sem.
+ * The difference in mmap_lock locking order mean that we cannot hold the
+ * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
+ * can fault in pages during copy in/out (for buffered IO) or require the
+ * mmap_lock in get_user_pages() to map the user pages into the kernel address
+ * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
+ * fault because page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
- * taken in places where we need to invalidate the page cache in a race
+ * take both the i_rwsem and the invalidate_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
*/
@@ -169,18 +189,7 @@ xfs_ilock(
{
trace_xfs_ilock(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
@@ -190,10 +199,13 @@ xfs_ilock(
XFS_IOLOCK_DEP(lock_flags));
}
- if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
+ XFS_MMAPLOCK_DEP(lock_flags));
+ }
if (lock_flags & XFS_ILOCK_EXCL)
mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
@@ -220,18 +232,7 @@ xfs_ilock_nowait(
{
trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
@@ -242,10 +243,10 @@ xfs_ilock_nowait(
}
if (lock_flags & XFS_MMAPLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_mmaplock))
+ if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_mmaplock))
+ if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
goto out_undo_iolock;
}
@@ -260,9 +261,9 @@ xfs_ilock_nowait(
out_undo_mmaplock:
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -289,19 +290,7 @@ xfs_iunlock(
xfs_inode_t *ip,
uint lock_flags)
{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- ASSERT(lock_flags != 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -309,9 +298,9 @@ xfs_iunlock(
up_read(&VFS_I(ip)->i_rwsem);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrunlock_excl(&ip->i_mmaplock);
+ up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
else if (lock_flags & XFS_MMAPLOCK_SHARED)
- mrunlock_shared(&ip->i_mmaplock);
+ up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_ILOCK_EXCL)
mrunlock_excl(&ip->i_lock);
@@ -337,7 +326,7 @@ xfs_ilock_demote(
if (lock_flags & XFS_ILOCK_EXCL)
mrdemote(&ip->i_lock);
if (lock_flags & XFS_MMAPLOCK_EXCL)
- mrdemote(&ip->i_mmaplock);
+ downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
if (lock_flags & XFS_IOLOCK_EXCL)
downgrade_write(&VFS_I(ip)->i_rwsem);
@@ -345,9 +334,29 @@ xfs_ilock_demote(
}
#if defined(DEBUG) || defined(XFS_WARN)
-int
+static inline bool
+__xfs_rwsem_islocked(
+ struct rw_semaphore *rwsem,
+ bool shared)
+{
+ if (!debug_locks)
+ return rwsem_is_locked(rwsem);
+
+ if (!shared)
+ return lockdep_is_held_type(rwsem, 0);
+
+ /*
+ * We are checking that the lock is held at least in shared
+ * mode but don't care that it might be held exclusively
+ * (i.e. shared | excl). Hence we check if the lock is held
+ * in any mode rather than an explicit shared mode.
+ */
+ return lockdep_is_held_type(rwsem, -1);
+}
+
+bool
xfs_isilocked(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
uint lock_flags)
{
if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
@@ -357,20 +366,17 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- if (!(lock_flags & XFS_MMAPLOCK_SHARED))
- return !!ip->i_mmaplock.mr_writer;
- return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
- if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
- if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !debug_locks ||
- lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
- return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
+ if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
+ (lock_flags & XFS_IOLOCK_SHARED));
}
ASSERT(0);
- return 0;
+ return false;
}
#endif
@@ -397,10 +403,12 @@ xfs_lockdep_subclass_ok(
* parent locking. Care must be taken to ensure we don't overrun the subclass
* storage fields in the class mask we build.
*/
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
+static inline uint
+xfs_lock_inumorder(
+ uint lock_mode,
+ uint subclass)
{
- int class = 0;
+ uint class = 0;
ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
XFS_ILOCK_RTSUM)));
@@ -445,13 +453,16 @@ xfs_lock_inodes(
int inodes,
uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
+ int attempts = 0;
+ uint i;
+ int j;
+ bool try_lock;
struct xfs_log_item *lp;
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
* support an arbitrary depth of locking here, but absolute limits on
- * inodes depend on the the type of locking and the limits placed by
+ * inodes depend on the type of locking and the limits placed by
* lockdep annotations in xfs_lock_inumorder. These are all checked by
* the asserts.
*/
@@ -470,9 +481,9 @@ xfs_lock_inodes(
} else if (lock_mode & XFS_MMAPLOCK_EXCL)
ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
- try_lock = 0;
- i = 0;
again:
+ try_lock = false;
+ i = 0;
for (; i < inodes; i++) {
ASSERT(ips[i]);
@@ -487,7 +498,7 @@ again:
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
- try_lock++;
+ try_lock = true;
}
}
@@ -527,19 +538,15 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
}
- i = 0;
- try_lock = 0;
goto again;
}
}
/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
- * the mmaplock or the ilock, but not more than one type at a time. If we lock
- * more than one at a time, lockdep will report false positives saying we have
- * violated locking orders. The iolock must be double-locked separately since
- * we use i_rwsem for that. We now support taking one lock EXCL and the other
- * SHARED.
+ * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
+ * mmaplock must be double-locked separately since we use i_rwsem and
+ * invalidate_lock for that. We now support taking one lock EXCL and the
+ * other SHARED.
*/
void
xfs_lock_two_inodes(
@@ -548,8 +555,6 @@ xfs_lock_two_inodes(
struct xfs_inode *ip1,
uint ip1_mode)
{
- struct xfs_inode *temp;
- uint mode_temp;
int attempts = 0;
struct xfs_log_item *lp;
@@ -557,24 +562,13 @@ xfs_lock_two_inodes(
ASSERT(hweight32(ip1_mode) == 1);
ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ||
- !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
-
+ ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
- temp = ip0;
- ip0 = ip1;
- ip1 = temp;
- mode_temp = ip0_mode;
- ip0_mode = ip1_mode;
- ip1_mode = mode_temp;
+ swap(ip0, ip1);
+ swap(ip0_mode, ip1_mode);
}
again:
@@ -598,83 +592,55 @@ xfs_lock_two_inodes(
}
}
-void
-__xfs_iflock(
+uint
+xfs_ip2xflags(
struct xfs_inode *ip)
{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wq_entry);
-}
-
-STATIC uint
-_xfs_dic2xflags(
- uint16_t di_flags,
- uint64_t di_flags2,
- bool has_attr)
-{
uint flags = 0;
- if (di_flags & XFS_DIFLAG_ANY) {
- if (di_flags & XFS_DIFLAG_REALTIME)
+ if (ip->i_diflags & XFS_DIFLAG_ANY) {
+ if (ip->i_diflags & XFS_DIFLAG_REALTIME)
flags |= FS_XFLAG_REALTIME;
- if (di_flags & XFS_DIFLAG_PREALLOC)
+ if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
flags |= FS_XFLAG_PREALLOC;
- if (di_flags & XFS_DIFLAG_IMMUTABLE)
+ if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
flags |= FS_XFLAG_IMMUTABLE;
- if (di_flags & XFS_DIFLAG_APPEND)
+ if (ip->i_diflags & XFS_DIFLAG_APPEND)
flags |= FS_XFLAG_APPEND;
- if (di_flags & XFS_DIFLAG_SYNC)
+ if (ip->i_diflags & XFS_DIFLAG_SYNC)
flags |= FS_XFLAG_SYNC;
- if (di_flags & XFS_DIFLAG_NOATIME)
+ if (ip->i_diflags & XFS_DIFLAG_NOATIME)
flags |= FS_XFLAG_NOATIME;
- if (di_flags & XFS_DIFLAG_NODUMP)
+ if (ip->i_diflags & XFS_DIFLAG_NODUMP)
flags |= FS_XFLAG_NODUMP;
- if (di_flags & XFS_DIFLAG_RTINHERIT)
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
flags |= FS_XFLAG_RTINHERIT;
- if (di_flags & XFS_DIFLAG_PROJINHERIT)
+ if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
flags |= FS_XFLAG_PROJINHERIT;
- if (di_flags & XFS_DIFLAG_NOSYMLINKS)
+ if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
flags |= FS_XFLAG_NOSYMLINKS;
- if (di_flags & XFS_DIFLAG_EXTSIZE)
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
flags |= FS_XFLAG_EXTSIZE;
- if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
+ if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
flags |= FS_XFLAG_EXTSZINHERIT;
- if (di_flags & XFS_DIFLAG_NODEFRAG)
+ if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
flags |= FS_XFLAG_NODEFRAG;
- if (di_flags & XFS_DIFLAG_FILESTREAM)
+ if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
flags |= FS_XFLAG_FILESTREAM;
}
- if (di_flags2 & XFS_DIFLAG2_ANY) {
- if (di_flags2 & XFS_DIFLAG2_DAX)
+ if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
+ if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
flags |= FS_XFLAG_DAX;
- if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
flags |= FS_XFLAG_COWEXTSIZE;
}
- if (has_attr)
+ if (xfs_inode_has_attr_fork(ip))
flags |= FS_XFLAG_HASATTR;
-
return flags;
}
-uint
-xfs_ip2xflags(
- struct xfs_inode *ip)
-{
- struct xfs_icdinode *dic = &ip->i_d;
-
- return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
-}
-
/*
* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
* is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -683,9 +649,9 @@ xfs_ip2xflags(
*/
int
xfs_lookup(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t **ipp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ struct xfs_inode **ipp,
struct xfs_name *ci_name)
{
xfs_ino_t inum;
@@ -693,7 +659,7 @@ xfs_lookup(
trace_xfs_lookup(dp, name);
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ if (xfs_is_shutdown(dp->i_mount))
return -EIO;
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -714,69 +680,121 @@ out_unlock:
return error;
}
-/*
- * Allocate an inode on disk and return a copy of its in-core version.
- * The in-core inode is locked exclusively. Set mode, nlink, and rdev
- * appropriately within the inode. The uid and gid for the inode are
- * set according to the contents of the given cred structure.
- *
- * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget() to obtain the in-core
- * version of the allocated inode. Finally, fill in the inode and
- * log its initial contents. In this case, ialloc_context would be
- * set to NULL.
- *
- * If xfs_dialloc() does not have an available inode, it will replenish
- * its supply by doing an allocation. Since we can only do one
- * allocation within a transaction without deadlocks, we must commit
- * the current transaction before returning the inode itself.
- * In this case, therefore, we will set ialloc_context and return.
- * The caller should then commit the current transaction, start a new
- * transaction, and call xfs_ialloc() again to actually get the inode.
- *
- * To ensure that some other process does not grab the inode that
- * was allocated during the first call to xfs_ialloc(), this routine
- * also returns the [locked] bp pointing to the head of the freelist
- * as ialloc_context. The caller should hold this buffer across
- * the commit and pass it back into this routine on the second call.
- *
- * If we are allocating quota inodes, we do not have a parent inode
- * to attach to or associate with (i.e. pip == NULL) because they
- * are not linked into the directory structure - they are attached
- * directly to the superblock - and so have no parent.
- */
-static int
-xfs_ialloc(
- xfs_trans_t *tp,
- xfs_inode_t *pip,
- umode_t mode,
- xfs_nlink_t nlink,
- dev_t rdev,
- prid_t prid,
- xfs_buf_t **ialloc_context,
- xfs_inode_t **ipp)
+/* Propagate di_flags from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
{
- struct xfs_mount *mp = tp->t_mountp;
- xfs_ino_t ino;
- xfs_inode_t *ip;
- uint flags;
- int error;
- struct timespec64 tv;
- struct inode *inode;
+ unsigned int di_flags = 0;
+ xfs_failaddr_t failaddr;
+ umode_t mode = VFS_I(ip)->i_mode;
+
+ if (S_ISDIR(mode)) {
+ if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_extsize = pip->i_extsize;
+ }
+ if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(mode)) {
+ if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ xfs_has_realtime(ip->i_mount))
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_extsize = pip->i_extsize;
+ }
+ }
+ if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+
+ ip->i_diflags |= di_flags;
+
+ /*
+ * Inode verifiers on older kernels only check that the extent size
+ * hint is an integer multiple of the rt extent size on realtime files.
+ * They did not check the hint alignment on a directory with both
+ * rtinherit and extszinherit flags set. If the misaligned hint is
+ * propagated from a directory into a new realtime file, new file
+ * allocations will fail due to math errors in the rt allocator and/or
+ * trip the verifiers. Validate the hint settings in the new file so
+ * that we don't let broken hints propagate.
+ */
+ failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+ VFS_I(ip)->i_mode, ip->i_diflags);
+ if (failaddr) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ }
+}
- /*
- * Call the space management code to pick
- * the on-disk inode to be allocated.
- */
- error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode,
- ialloc_context, &ino);
- if (error)
- return error;
- if (*ialloc_context || ino == NULLFSINO) {
- *ipp = NULL;
- return 0;
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags2(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ xfs_failaddr_t failaddr;
+
+ if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = pip->i_cowextsize;
}
- ASSERT(*ialloc_context == NULL);
+ if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
+ ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+ /* Don't let invalid cowextsize hints propagate. */
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+ VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+ if (failaddr) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ }
+}
+
+/*
+ * Initialise a newly allocated inode and return the in-core inode to the
+ * caller locked exclusively.
+ */
+int
+xfs_init_new_inode(
+ struct user_namespace *mnt_userns,
+ struct xfs_trans *tp,
+ struct xfs_inode *pip,
+ xfs_ino_t ino,
+ umode_t mode,
+ xfs_nlink_t nlink,
+ dev_t rdev,
+ prid_t prid,
+ bool init_xattrs,
+ struct xfs_inode **ipp)
+{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ unsigned int flags;
+ int error;
+ struct timespec64 tv;
+ struct inode *inode;
/*
* Protect against obviously corrupt allocation btree records. Later
@@ -791,36 +809,25 @@ xfs_ialloc(
}
/*
- * Get the in-core inode with the lock held exclusively.
- * This is because we're setting fields here we need
- * to prevent others from looking at until we're done.
+ * Get the in-core inode with the lock held exclusively to prevent
+ * others from looking at until we're done.
*/
- error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
- XFS_ILOCK_EXCL, &ip);
+ error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
if (error)
return error;
+
ASSERT(ip != NULL);
inode = VFS_I(ip);
-
- /*
- * We always convert v1 inodes to v2 now - we only support filesystems
- * with >= v2 inode capability, so there is no reason for ever leaving
- * an inode in v1 format.
- */
- if (ip->i_d.di_version == 1)
- ip->i_d.di_version = 2;
-
- inode->i_mode = mode;
set_nlink(inode, nlink);
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
inode->i_rdev = rdev;
- ip->i_d.di_projid = prid;
+ ip->i_projid = prid;
- if (pip && XFS_INHERIT_GID(pip)) {
- ip->i_d.di_gid = pip->i_d.di_gid;
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
+ if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
+ inode_fsuid_set(inode, mnt_userns);
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
+ } else {
+ inode_init_owner(mnt_userns, inode, dir, mode);
}
/*
@@ -828,115 +835,66 @@ xfs_ialloc(
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if ((irix_sgid_inherit) &&
- (inode->i_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+ !vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
inode->i_mode &= ~S_ISGID;
- ip->i_d.di_size = 0;
- ip->i_d.di_nextents = 0;
- ASSERT(ip->i_d.di_nblocks == 0);
+ ip->i_disk_size = 0;
+ ip->i_df.if_nextents = 0;
+ ASSERT(ip->i_nblocks == 0);
tv = current_time(inode);
inode->i_mtime = tv;
inode->i_atime = tv;
inode->i_ctime = tv;
- ip->i_d.di_extsize = 0;
- ip->i_d.di_dmevmask = 0;
- ip->i_d.di_dmstate = 0;
- ip->i_d.di_flags = 0;
+ ip->i_extsize = 0;
+ ip->i_diflags = 0;
- if (ip->i_d.di_version == 3) {
+ if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
- ip->i_d.di_flags2 = 0;
- ip->i_d.di_cowextsize = 0;
- ip->i_d.di_crtime = tv;
+ ip->i_cowextsize = 0;
+ ip->i_crtime = tv;
}
-
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
case S_IFSOCK:
- ip->i_d.di_format = XFS_DINODE_FMT_DEV;
- ip->i_df.if_flags = 0;
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
flags |= XFS_ILOG_DEV;
break;
case S_IFREG:
case S_IFDIR:
- if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint di_flags = 0;
-
- if (S_ISDIR(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_REALTIME;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSIZE;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- }
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
- xfs_inherit_noatime)
- di_flags |= XFS_DIFLAG_NOATIME;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
- xfs_inherit_nodump)
- di_flags |= XFS_DIFLAG_NODUMP;
- if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
- xfs_inherit_sync)
- di_flags |= XFS_DIFLAG_SYNC;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
- xfs_inherit_nosymlinks)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
- xfs_inherit_nodefrag)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
-
- ip->i_d.di_flags |= di_flags;
- }
- if (pip &&
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
- pip->i_d.di_version == 3 &&
- ip->i_d.di_version == 3) {
- uint64_t di_flags2 = 0;
-
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
- }
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
-
- ip->i_d.di_flags2 |= di_flags2;
- }
- /* FALLTHROUGH */
+ if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
+ xfs_inode_inherit_flags(ip, pip);
+ if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
+ xfs_inode_inherit_flags2(ip, pip);
+ fallthrough;
case S_IFLNK:
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
- ip->i_df.if_flags = XFS_IFEXTENTS;
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_bytes = 0;
ip->i_df.if_u1.if_root = NULL;
break;
default:
ASSERT(0);
}
+
/*
- * Attribute fork settings for new inode.
+ * If we need to create attributes immediately after allocating the
+ * inode, initialise an empty attribute fork right now. We use the
+ * default fork offset for attributes here as we don't know exactly what
+ * size or how many attributes we might be adding. We can do this
+ * safely here because we know the data fork is completely empty and
+ * this saves us from needing to run a separate transaction to set the
+ * fork offset in the immediate future.
*/
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ip->i_d.di_anextents = 0;
+ if (init_xattrs && xfs_has_attr(mp)) {
+ ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+ }
/*
* Log the new values stuffed into the inode.
@@ -952,146 +910,6 @@ xfs_ialloc(
}
/*
- * Allocates a new inode from disk and return a pointer to the
- * incore copy. This routine will internally commit the current
- * transaction and allocate a new one if the Space Manager needed
- * to do an allocation to replenish the inode free-list.
- *
- * This routine is designed to be called from xfs_create and
- * xfs_create_dir.
- *
- */
-int
-xfs_dir_ialloc(
- xfs_trans_t **tpp, /* input: current transaction;
- output: may be a new transaction. */
- xfs_inode_t *dp, /* directory within whose allocate
- the inode. */
- umode_t mode,
- xfs_nlink_t nlink,
- dev_t rdev,
- prid_t prid, /* project id */
- xfs_inode_t **ipp) /* pointer to inode; it will be
- locked. */
-{
- xfs_trans_t *tp;
- xfs_inode_t *ip;
- xfs_buf_t *ialloc_context = NULL;
- int code;
- void *dqinfo;
- uint tflags;
-
- tp = *tpp;
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-
- /*
- * xfs_ialloc will return a pointer to an incore inode if
- * the Space Manager has an available inode on the free
- * list. Otherwise, it will do an allocation and replenish
- * the freelist. Since we can only do one allocation per
- * transaction without deadlocks, we will need to commit the
- * current transaction and start a new one. We will then
- * need to call xfs_ialloc again to get the inode.
- *
- * If xfs_ialloc did an allocation to replenish the freelist,
- * it returns the bp containing the head of the freelist as
- * ialloc_context. We will hold a lock on it across the
- * transaction commit so that no other process can steal
- * the inode(s) that we've just allocated.
- */
- code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context,
- &ip);
-
- /*
- * Return an error if we were unable to allocate a new inode.
- * This should only happen if we run out of space on disk or
- * encounter a disk error.
- */
- if (code) {
- *ipp = NULL;
- return code;
- }
- if (!ialloc_context && !ip) {
- *ipp = NULL;
- return -ENOSPC;
- }
-
- /*
- * If the AGI buffer is non-NULL, then we were unable to get an
- * inode in one operation. We need to commit the current
- * transaction and call xfs_ialloc() again. It is guaranteed
- * to succeed the second time.
- */
- if (ialloc_context) {
- /*
- * Normally, xfs_trans_commit releases all the locks.
- * We call bhold to hang on to the ialloc_context across
- * the commit. Holding this buffer prevents any other
- * processes from doing any allocations in this
- * allocation group.
- */
- xfs_trans_bhold(tp, ialloc_context);
-
- /*
- * We want the quota changes to be associated with the next
- * transaction, NOT this one. So, detach the dqinfo from this
- * and attach it to the next transaction.
- */
- dqinfo = NULL;
- tflags = 0;
- if (tp->t_dqinfo) {
- dqinfo = (void *)tp->t_dqinfo;
- tp->t_dqinfo = NULL;
- tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
- tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
- }
-
- code = xfs_trans_roll(&tp);
-
- /*
- * Re-attach the quota info that we detached from prev trx.
- */
- if (dqinfo) {
- tp->t_dqinfo = dqinfo;
- tp->t_flags |= tflags;
- }
-
- if (code) {
- xfs_buf_relse(ialloc_context);
- *tpp = tp;
- *ipp = NULL;
- return code;
- }
- xfs_trans_bjoin(tp, ialloc_context);
-
- /*
- * Call ialloc again. Since we've locked out all
- * other allocations in this allocation group,
- * this call should always succeed.
- */
- code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
- &ialloc_context, &ip);
-
- /*
- * If we get an error at this point, return to the caller
- * so that the current transaction can be aborted.
- */
- if (code) {
- *tpp = tp;
- *ipp = NULL;
- return code;
- }
- ASSERT(!ialloc_context && ip);
-
- }
-
- *ipp = ip;
- *tpp = tp;
-
- return 0;
-}
-
-/*
* Decrement the link count on an inode & log the change. If this causes the
* link count to go to zero, move the inode to AGI unlinked list so that it can
* be freed when the last active reference goes away via xfs_inactive().
@@ -1122,17 +940,18 @@ xfs_bumplink(
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
int
xfs_create(
+ struct user_namespace *mnt_userns,
xfs_inode_t *dp,
struct xfs_name *name,
umode_t mode,
dev_t rdev,
+ bool init_xattrs,
xfs_inode_t **ipp)
{
int is_dir = S_ISDIR(mode);
@@ -1147,10 +966,11 @@ xfs_create(
struct xfs_dquot *pdqp = NULL;
struct xfs_trans_res *tres;
uint resblks;
+ xfs_ino_t ino;
trace_xfs_create(dp, name);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1158,10 +978,10 @@ xfs_create(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -1179,38 +999,35 @@ xfs_create(
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
+ resblks, &tp);
}
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
/*
- * Reserve disk quota and the inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
- if (error)
- goto out_trans_cancel;
-
- /*
* A newly created regular or special file just has one directory
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
+ is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
if (error)
goto out_trans_cancel;
/*
* Now we join the directory inode to the transaction. We do not do it
- * earlier because xfs_dir_ialloc might commit the previous transaction
+ * earlier because xfs_dialloc might commit the previous transaction
* (and release all the locks). An error from here on will result in
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
@@ -1219,8 +1036,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
@@ -1241,7 +1057,7 @@ xfs_create(
* create transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
/*
@@ -1274,7 +1090,7 @@ xfs_create(
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -1286,6 +1102,7 @@ xfs_create(
int
xfs_create_tmpfile(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
umode_t mode,
struct xfs_inode **ipp)
@@ -1300,8 +1117,9 @@ xfs_create_tmpfile(
struct xfs_dquot *pdqp = NULL;
struct xfs_trans_res *tres;
uint resblks;
+ xfs_ino_t ino;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1309,30 +1127,29 @@ xfs_create_tmpfile(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
if (error)
return error;
resblks = XFS_IALLOC_SPACE_RES(mp);
tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
- if (error)
- goto out_release_inode;
-
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error)
- goto out_trans_cancel;
+ goto out_release_dquots;
- error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode,
+ 0, 0, prid, false, &ip);
if (error)
goto out_trans_cancel;
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
/*
@@ -1369,7 +1186,7 @@ xfs_create_tmpfile(
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -1385,14 +1202,14 @@ xfs_link(
{
xfs_mount_t *mp = tdp->i_mount;
xfs_trans_t *tp;
- int error;
+ int error, nospace_error = 0;
int resblks;
trace_xfs_link(tdp, target_name);
ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(sip);
@@ -1404,26 +1221,18 @@ xfs_link(
goto std_return;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
- }
+ error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
+ &tp, &nospace_error);
if (error)
goto std_return;
- xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
* the tree quota mechanism could be circumvented.
*/
- if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- tdp->i_d.di_projid != sip->i_d.di_projid)) {
+ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ tdp->i_projid != sip->i_projid)) {
error = -EXDEV;
goto error_return;
}
@@ -1438,7 +1247,11 @@ xfs_link(
* Handle initial link state of O_TMPFILE inode
*/
if (VFS_I(sip)->i_nlink == 0) {
- error = xfs_iunlink_remove(tp, sip);
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, sip);
+ xfs_perag_put(pag);
if (error)
goto error_return;
}
@@ -1457,7 +1270,7 @@ xfs_link(
* link transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
@@ -1465,6 +1278,8 @@ xfs_link(
error_return:
xfs_trans_cancel(tp);
std_return:
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -1478,10 +1293,10 @@ xfs_itruncate_clear_reflink_flags(
if (!xfs_is_reflink_inode(ip))
return;
- dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (cfork->if_bytes == 0)
xfs_inode_clear_cowblocks_tag(ip);
}
@@ -1545,7 +1360,7 @@ xfs_itruncate_extents_flags(
* the page cache can't scale that far.
*/
first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- if (first_unmap_block >= XFS_MAX_FILEOFF) {
+ if (!xfs_verify_fileoff(mp, first_unmap_block)) {
WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
return 0;
}
@@ -1558,17 +1373,10 @@ xfs_itruncate_extents_flags(
if (error)
goto out;
- /*
- * Duplicate the transaction that has the permanent
- * reservation and commit the old transaction.
- */
+ /* free the just unmapped extents */
error = xfs_defer_finish(&tp);
if (error)
goto out;
-
- error = xfs_trans_roll_inode(&tp, ip);
- if (error)
- goto out;
}
if (whichfork == XFS_DATA_FORK) {
@@ -1599,16 +1407,16 @@ xfs_release(
xfs_inode_t *ip)
{
xfs_mount_t *mp = ip->i_mount;
- int error;
+ int error = 0;
if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
return 0;
/* If this is a read-only mount, don't do this (would generate I/O) */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return 0;
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (!xfs_is_shutdown(mp)) {
int truncated;
/*
@@ -1635,8 +1443,16 @@ xfs_release(
if (VFS_I(ip)->i_nlink == 0)
return 0;
- if (xfs_can_free_eofblocks(ip, false)) {
+ /*
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
+ return 0;
+ if (xfs_can_free_eofblocks(ip, false)) {
/*
* Check if the inode is being opened, written and closed
* frequently and we have delayed allocation blocks outstanding
@@ -1652,26 +1468,20 @@ xfs_release(
* place.
*/
if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
- return 0;
- /*
- * If we can't get the iolock just skip truncating the blocks
- * past EOF because we could deadlock with the mmap_sem
- * otherwise. We'll get another chance to drop them once the
- * last reference to the inode is dropped, so we'll never leak
- * blocks permanently.
- */
- if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- error = xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- if (error)
- return error;
- }
+ goto out_unlock;
+
+ error = xfs_free_eofblocks(ip);
+ if (error)
+ goto out_unlock;
/* delalloc blocks after truncation means it really is dirty */
if (ip->i_delayed_blks)
xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
}
- return 0;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
}
/*
@@ -1689,7 +1499,7 @@ xfs_inactive_truncate(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1700,14 +1510,14 @@ xfs_inactive_truncate(
* of a system crash before the truncate completes. See the related
* comment in xfs_vn_setattr_size() for details.
*/
- ip->i_d.di_size = 0;
+ ip->i_disk_size = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error)
goto error_trans_cancel;
- ASSERT(ip->i_d.di_nextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
error = xfs_trans_commit(tp);
if (error)
@@ -1760,28 +1570,48 @@ xfs_inactive_ifree(
"Failed to remove inode(s) from unlinked list. "
"Please free space, unmount and run xfs_repair.");
} else {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
}
return error;
}
+ /*
+ * We do not hold the inode locked across the entire rolling transaction
+ * here. We only need to hold it for the first transaction that
+ * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+ * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+ * here breaks the relationship between cluster buffer invalidation and
+ * stale inode invalidation on cluster buffer item journal commit
+ * completion, and can result in leaving dirty stale inodes hanging
+ * around in memory.
+ *
+ * We have no need for serialising this inode operation against other
+ * operations - we freed the inode and hence reallocation is required
+ * and that will serialise on reallocating the space the deferops need
+ * to free. Hence we can unlock the inode on the first commit of
+ * the transaction rather than roll it right through the deferops. This
+ * avoids relogging the XFS_ISTALE inode.
+ *
+ * We check that xfs_ifree() hasn't grown an internal transaction roll
+ * by asserting that the inode is still locked when it returns.
+ */
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_ifree(tp, ip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (error) {
/*
* If we fail to free the inode, shut down. The cancel
* might do that, we need to make sure. Otherwise the
* inode might be lost for a long time or forever.
*/
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (!xfs_is_shutdown(mp)) {
xfs_notice(mp, "%s: xfs_ifree returned error %d",
__func__, error);
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
}
xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1799,11 +1629,63 @@ xfs_inactive_ifree(
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
/*
+ * Returns true if we need to update the on-disk metadata before we can free
+ * the memory used by this inode. Updates include freeing post-eof
+ * preallocations; freeing COW staging extents; and marking the inode free in
+ * the inobt if it is on the unlinked list.
+ */
+bool
+xfs_inode_needs_inactive(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (VFS_I(ip)->i_mode == 0)
+ return false;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (xfs_is_readonly(mp))
+ return false;
+
+ /* If the log isn't running, push inodes straight to reclaim. */
+ if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
+ return false;
+
+ /* Metadata inodes require explicit resource cleanup. */
+ if (xfs_is_metadata_inode(ip))
+ return false;
+
+ /* Want to clean out the cow blocks if there are any. */
+ if (cow_ifp && cow_ifp->if_bytes > 0)
+ return true;
+
+ /* Unlinked files must be freed. */
+ if (VFS_I(ip)->i_nlink == 0)
+ return true;
+
+ /*
+ * This file isn't being freed, so check if there are post-eof blocks
+ * to free. @force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with broken
+ * free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains about
+ * acquiring it in reclaim context. We have the only reference to the
+ * inode at this point anyways.
+ */
+ return xfs_can_free_eofblocks(ip, true);
+}
+
+/*
* xfs_inactive
*
* This is called when the vnode reference count for the vnode
@@ -1825,15 +1707,19 @@ xfs_inactive(
*/
if (VFS_I(ip)->i_mode == 0) {
ASSERT(ip->i_df.if_broot_bytes == 0);
- return;
+ goto out;
}
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
/* If this is a read-only mount, don't do this (would generate I/O) */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return;
+ if (xfs_is_readonly(mp))
+ goto out;
+
+ /* Metadata inodes require explicit resource cleanup. */
+ if (xfs_is_metadata_inode(ip))
+ goto out;
/* Try to clean out the cow blocks if there are any. */
if (xfs_inode_has_cow_data(ip))
@@ -1852,49 +1738,47 @@ xfs_inactive(
if (xfs_can_free_eofblocks(ip, true))
xfs_free_eofblocks(ip);
- return;
+ goto out;
}
if (S_ISREG(VFS_I(ip)->i_mode) &&
- (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
- ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+ (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
error = xfs_qm_dqattach(ip);
if (error)
- return;
+ goto out;
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
error = xfs_inactive_truncate(ip);
if (error)
- return;
+ goto out;
/*
* If there are attributes associated with the file then blow them away
* now. The code calls a routine that recursively deconstructs the
* attribute fork. If also blows away the in-core attribute fork.
*/
- if (XFS_IFORK_Q(ip)) {
+ if (xfs_inode_has_attr_fork(ip)) {
error = xfs_attr_inactive(ip);
if (error)
- return;
+ goto out;
}
- ASSERT(!ip->i_afp);
- ASSERT(ip->i_d.di_anextents == 0);
- ASSERT(ip->i_d.di_forkoff == 0);
+ ASSERT(ip->i_forkoff == 0);
/*
* Free the inode.
*/
- error = xfs_inactive_ifree(ip);
- if (error)
- return;
+ xfs_inactive_ifree(ip);
+out:
/*
- * Release the dquots held by inode, if any.
+ * We're done making metadata updates for this inode, so we can release
+ * the attached dquots.
*/
xfs_qm_dqdetach(ip);
}
@@ -1916,195 +1800,69 @@ xfs_inactive(
* because we must walk that list to find the inode that points to the inode
* being removed from the unlinked hash bucket list.
*
- * What if we modelled the unlinked list as a collection of records capturing
- * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
- * have a fast way to look up unlinked list predecessors, which avoids the
- * slow list walk. That's exactly what we do here (in-core) with a per-AG
- * rhashtable.
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
*
- * Because this is a backref cache, we ignore operational failures since the
- * iunlink code can fall back to the slow bucket walk. The only errors that
- * should bubble out are for obviously incorrect situations.
- *
- * All users of the backref cache MUST hold the AGI buffer lock to serialize
- * access or have otherwise provided for concurrency control.
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
*/
-/* Capture a "X.next_unlinked = Y" relationship. */
-struct xfs_iunlink {
- struct rhash_head iu_rhash_head;
- xfs_agino_t iu_agino; /* X */
- xfs_agino_t iu_next_unlinked; /* Y */
-};
-
-/* Unlinked list predecessor lookup hashtable construction */
-static int
-xfs_iunlink_obj_cmpfn(
- struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const xfs_agino_t *key = arg->key;
- const struct xfs_iunlink *iu = obj;
-
- if (iu->iu_next_unlinked != *key)
- return 1;
- return 0;
-}
-
-static const struct rhashtable_params xfs_iunlink_hash_params = {
- .min_size = XFS_AGI_UNLINKED_BUCKETS,
- .key_len = sizeof(xfs_agino_t),
- .key_offset = offsetof(struct xfs_iunlink,
- iu_next_unlinked),
- .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
- .automatic_shrinking = true,
- .obj_cmpfn = xfs_iunlink_obj_cmpfn,
-};
-
/*
- * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
- * relation is found.
+ * Find an inode on the unlinked list. This does not take references to the
+ * inode as we have existence guarantees by holding the AGI buffer lock and that
+ * only unlinked, referenced inodes can be on the unlinked inode list. If we
+ * don't find the inode in cache, then let the caller handle the situation.
*/
-static xfs_agino_t
-xfs_iunlink_lookup_backref(
+static struct xfs_inode *
+xfs_iunlink_lookup(
struct xfs_perag *pag,
xfs_agino_t agino)
{
- struct xfs_iunlink *iu;
-
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- return iu ? iu->iu_agino : NULLAGINO;
-}
+ struct xfs_inode *ip;
-/*
- * Take ownership of an iunlink cache entry and insert it into the hash table.
- * If successful, the entry will be owned by the cache; if not, it is freed.
- * Either way, the caller does not own @iu after this call.
- */
-static int
-xfs_iunlink_insert_backref(
- struct xfs_perag *pag,
- struct xfs_iunlink *iu)
-{
- int error;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
- error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
/*
- * Fail loudly if there already was an entry because that's a sign of
- * corruption of in-memory data. Also fail loudly if we see an error
- * code we didn't anticipate from the rhashtable code. Currently we
- * only anticipate ENOMEM.
+ * Inode not in memory or in RCU freeing limbo should not happen.
+ * Warn about this and let the caller handle the failure.
*/
- if (error) {
- WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
- kmem_free(iu);
+ if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ rcu_read_unlock();
+ return NULL;
}
- /*
- * Absorb any runtime errors that aren't a result of corruption because
- * this is a cache and we can always fall back to bucket list scanning.
- */
- if (error != 0 && error != -EEXIST)
- error = 0;
- return error;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
+ rcu_read_unlock();
+ return ip;
}
-/* Remember that @prev_agino.next_unlinked = @this_agino. */
+/* Update the prev pointer of the next agino. */
static int
-xfs_iunlink_add_backref(
+xfs_iunlink_update_backref(
struct xfs_perag *pag,
xfs_agino_t prev_agino,
- xfs_agino_t this_agino)
-{
- struct xfs_iunlink *iu;
-
- if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
- return 0;
-
- iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
- iu->iu_agino = prev_agino;
- iu->iu_next_unlinked = this_agino;
-
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/*
- * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
- * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
- * wasn't any such entry then we don't bother.
- */
-static int
-xfs_iunlink_change_backref(
- struct xfs_perag *pag,
- xfs_agino_t agino,
- xfs_agino_t next_unlinked)
+ xfs_agino_t next_agino)
{
- struct xfs_iunlink *iu;
- int error;
+ struct xfs_inode *ip;
- /* Look up the old entry; if there wasn't one then exit. */
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- if (!iu)
+ /* No update necessary if we are at the end of the list. */
+ if (next_agino == NULLAGINO)
return 0;
- /*
- * Remove the entry. This shouldn't ever return an error, but if we
- * couldn't remove the old entry we don't want to add it again to the
- * hash table, and if the entry disappeared on us then someone's
- * violated the locking rules and we need to fail loudly. Either way
- * we cannot remove the inode because internal state is or would have
- * been corrupt.
- */
- error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
- if (error)
- return error;
-
- /* If there is no new next entry just free our item and return. */
- if (next_unlinked == NULLAGINO) {
- kmem_free(iu);
- return 0;
- }
-
- /* Update the entry and re-add it to the hash table. */
- iu->iu_next_unlinked = next_unlinked;
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/* Set up the in-core predecessor structures. */
-int
-xfs_iunlink_init(
- struct xfs_perag *pag)
-{
- return rhashtable_init(&pag->pagi_unlinked_hash,
- &xfs_iunlink_hash_params);
-}
-
-/* Free the in-core predecessor structures. */
-static void
-xfs_iunlink_free_item(
- void *ptr,
- void *arg)
-{
- struct xfs_iunlink *iu = ptr;
- bool *freed_anything = arg;
-
- *freed_anything = true;
- kmem_free(iu);
-}
-
-void
-xfs_iunlink_destroy(
- struct xfs_perag *pag)
-{
- bool freed_anything = false;
-
- rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
- xfs_iunlink_free_item, &freed_anything);
-
- ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+ return -EFSCORRUPTED;
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
}
/*
@@ -2114,19 +1872,19 @@ xfs_iunlink_destroy(
STATIC int
xfs_iunlink_update_bucket(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_buf *agibp,
unsigned int bucket_index,
xfs_agino_t new_agino)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t old_value;
int offset;
- ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, new_agino));
old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
old_value, new_agino);
/*
@@ -2135,7 +1893,7 @@ xfs_iunlink_update_bucket(
* head of the list.
*/
if (old_value == new_agino) {
- xfs_buf_corruption_error(agibp);
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
}
@@ -2146,121 +1904,20 @@ xfs_iunlink_update_bucket(
return 0;
}
-/* Set an on-disk inode's next_unlinked pointer. */
-STATIC void
-xfs_iunlink_update_dinode(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- struct xfs_buf *ibp,
- struct xfs_dinode *dip,
- struct xfs_imap *imap,
- xfs_agino_t next_agino)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int offset;
-
- ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
-
- trace_xfs_iunlink_update_dinode(mp, agno, agino,
- be32_to_cpu(dip->di_next_unlinked), next_agino);
-
- dip->di_next_unlinked = cpu_to_be32(next_agino);
- offset = imap->im_boffset +
- offsetof(struct xfs_dinode, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
- xfs_inobp_check(mp, ibp);
-}
-
-/* Set an in-core inode's unlinked pointer and return the old value. */
-STATIC int
-xfs_iunlink_update_inode(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- xfs_agnumber_t agno,
- xfs_agino_t next_agino,
- xfs_agino_t *old_next_agino)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_dinode *dip;
- struct xfs_buf *ibp;
- xfs_agino_t old_value;
- int error;
-
- ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
-
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
- if (error)
- return error;
-
- /* Make sure the old pointer isn't garbage. */
- old_value = be32_to_cpu(dip->di_next_unlinked);
- if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
- sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- goto out;
- }
-
- /*
- * Since we're updating a linked list, we should never find that the
- * current pointer is the same as the new value, unless we're
- * terminating the list.
- */
- *old_next_agino = old_value;
- if (old_value == next_agino) {
- if (next_agino != NULLAGINO) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
- dip, sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- }
- goto out;
- }
-
- /* Ok, update the new pointer. */
- xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
- ibp, dip, &ip->i_imap, next_agino);
- return 0;
-out:
- xfs_trans_brelse(tp, ibp);
- return error;
-}
-
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI. It will be pulled from this
- * list when the inode is freed.
- */
-STATIC int
-xfs_iunlink(
+static int
+xfs_iunlink_insert_inode(
struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi;
- struct xfs_buf *agibp;
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t next_agino;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
- ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(VFS_I(ip)->i_mode != 0);
- trace_xfs_iunlink(ip);
-
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, agno, &agibp);
- if (error)
- return error;
- agi = XFS_BUF_TO_AGI(agibp);
-
/*
* Get the index into the agi hash table for the list this inode will
* go on. Make sure the pointer isn't garbage and that this inode
@@ -2268,152 +1925,132 @@ xfs_iunlink(
*/
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, agno, next_agino)) {
- xfs_buf_corruption_error(agibp);
+ !xfs_verify_agino_or_null(pag, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
}
- if (next_agino != NULLAGINO) {
- struct xfs_perag *pag;
- xfs_agino_t old_agino;
+ /*
+ * Update the prev pointer in the next inode to point back to this
+ * inode.
+ */
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error)
+ return error;
+ if (next_agino != NULLAGINO) {
/*
* There is already another inode in the bucket, so point this
* inode to the current head of the list.
*/
- error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
- &old_agino);
- if (error)
- return error;
- ASSERT(old_agino == NULLAGINO);
-
- /*
- * agino has been unlinked, add a backref from the next inode
- * back to agino.
- */
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- xfs_perag_put(pag);
+ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
if (error)
return error;
+ ip->i_next_unlinked = next_agino;
}
/* Point the head of the list to point to this inode. */
- return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
-/* Return the imap, dinode pointer, and buffer for an inode. */
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
+ */
STATIC int
-xfs_iunlink_map_ino(
+xfs_iunlink(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_buf *agibp;
int error;
- imap->im_blkno = 0;
- error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- return 0;
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ goto out;
+
+ error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+ xfs_perag_put(pag);
+ return error;
}
-/*
- * Walk the unlinked chain from @head_agino until we find the inode that
- * points to @target_agino. Return the inode number, map, dinode pointer,
- * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
- *
- * @tp, @pag, @head_agino, and @target_agino are input parameters.
- * @agino, @imap, @dipp, and @bpp are all output parameters.
- *
- * Do not call this function if @target_agino is the head of the list.
- */
-STATIC int
-xfs_iunlink_map_prev(
+static int
+xfs_iunlink_remove_inode(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agino_t head_agino,
- xfs_agino_t target_agino,
- xfs_agino_t *agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp,
- struct xfs_perag *pag)
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- xfs_agino_t next_agino;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
- ASSERT(head_agino != target_agino);
- *bpp = NULL;
-
- /* See if our backref cache can find it faster. */
- *agino = xfs_iunlink_lookup_backref(pag, target_agino);
- if (*agino != NULLAGINO) {
- error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
- if (error)
- return error;
-
- if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
- return 0;
+ trace_xfs_iunlink_remove(ip);
- /*
- * If we get here the cache contents were corrupt, so drop the
- * buffer and fall back to walking the bucket list.
- */
- xfs_trans_brelse(tp, *bpp);
- *bpp = NULL;
- WARN_ON_ONCE(1);
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
+ */
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(pag, head_agino)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ agi, sizeof(*agi));
+ return -EFSCORRUPTED;
}
- trace_xfs_iunlink_map_prev_fallback(mp, agno);
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
+ if (error)
+ return error;
- /* Otherwise, walk the entire bucket until we find it. */
- next_agino = head_agino;
- while (next_agino != target_agino) {
- xfs_agino_t unlinked_agino;
+ /*
+ * Update the prev pointer in the next inode to point back to previous
+ * inode in the chain.
+ */
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error)
+ return error;
- if (*bpp)
- xfs_trans_brelse(tp, *bpp);
+ if (head_agino != agino) {
+ struct xfs_inode *prev_ip;
- *agino = next_agino;
- error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
- bpp);
- if (error)
- return error;
+ prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+ if (!prev_ip)
+ return -EFSCORRUPTED;
- unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
- /*
- * Make sure this pointer is valid and isn't an obvious
- * infinite loop.
- */
- if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
- next_agino == unlinked_agino) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- *dipp, sizeof(**dipp));
- error = -EFSCORRUPTED;
- return error;
- }
- next_agino = unlinked_agino;
+ error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+ ip->i_next_unlinked);
+ prev_ip->i_next_unlinked = ip->i_next_unlinked;
+ } else {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+ ip->i_next_unlinked);
}
- return 0;
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
+ return error;
}
/*
@@ -2422,105 +2059,116 @@ xfs_iunlink_map_prev(
STATIC int
xfs_iunlink_remove(
struct xfs_trans *tp,
+ struct xfs_perag *pag,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi;
struct xfs_buf *agibp;
- struct xfs_buf *last_ibp;
- struct xfs_dinode *last_dip = NULL;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- xfs_agino_t next_agino;
- xfs_agino_t head_agino;
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
trace_xfs_iunlink_remove(ip);
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, agno, &agibp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the head pointer isn't garbage.
- */
- head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (!xfs_verify_agino(mp, agno, head_agino)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- agi, sizeof(*agi));
- return -EFSCORRUPTED;
+ return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
+}
+
+/*
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
+ */
+static void
+xfs_ifree_mark_inode_stale(
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode_log_item *iip;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
+ return;
}
/*
- * Set our inode's next_unlinked pointer to NULL and then return
- * the old pointer value so that we can update whatever was previous
- * to us in the list to point to whatever was next in the list.
+ * because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
*/
- error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
- if (error)
- return error;
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
+ goto out_iflags_unlock;
/*
- * If there was a backref pointing from the next inode back to this
- * one, remove it because we've removed this inode from the list.
- *
- * Later, if this inode was in the middle of the list we'll update
- * this inode's backref to point from the next inode.
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
*/
- if (next_agino != NULLAGINO) {
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_change_backref(pag, next_agino,
- NULLAGINO);
- if (error)
- goto out;
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
}
+ ip->i_flags |= XFS_ISTALE;
- if (head_agino == agino) {
- /* Point the head of the list to the next unlinked inode. */
- error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
- next_agino);
- if (error)
- goto out;
- } else {
- struct xfs_imap imap;
- xfs_agino_t prev_agino;
+ /*
+ * If the inode is flushing, it is already attached to the buffer. All
+ * we needed to do here is mark the inode stale so buffer IO completion
+ * will remove it from the AIL.
+ */
+ iip = ip->i_itemp;
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(iip->ili_last_fields);
+ goto out_iunlock;
+ }
- if (!pag)
- pag = xfs_perag_get(mp, agno);
+ /*
+ * Inodes not attached to the buffer can be released immediately.
+ * Everything else has to go through xfs_iflush_abort() on journal
+ * commit as the flock synchronises removal of the inode from the
+ * cluster buffer against inode reclaim.
+ */
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
+ goto out_iunlock;
- /* We need to search the list for the inode being freed. */
- error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
- &prev_agino, &imap, &last_dip, &last_ibp,
- pag);
- if (error)
- goto out;
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
- /* Point the previous inode on the list to the next inode. */
- xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
- last_dip, &imap, next_agino);
+ /* we have a dirty inode in memory that has not yet been flushed. */
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
+ ASSERT(iip->ili_last_fields);
- /*
- * Now we deal with the backref for this inode. If this inode
- * pointed at a real inode, change the backref that pointed to
- * us to point to our old next. If this inode was the end of
- * the list, delete the backref that pointed to us. Note that
- * change_backref takes care of deleting the backref if
- * next_agino is NULLAGINO.
- */
- error = xfs_iunlink_change_backref(pag, agino, next_agino);
- if (error)
- goto out;
- }
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return;
-out:
- if (pag)
- xfs_perag_put(pag);
- return error;
+out_iunlock:
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_iflags_unlock:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
}
/*
@@ -2528,28 +2176,23 @@ out:
* inodes that are in memory - they all must be marked stale and attached to
* the cluster buffer.
*/
-STATIC int
+static int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *free_ip,
struct xfs_icluster *xic)
{
- xfs_mount_t *mp = free_ip->i_mount;
+ struct xfs_mount *mp = free_ip->i_mount;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ xfs_ino_t inum = xic->first_ino;
int nbufs;
int i, j;
int ioffset;
- xfs_daddr_t blkno;
- xfs_buf_t *bp;
- xfs_inode_t *ip;
- xfs_inode_log_item_t *iip;
- struct xfs_log_item *lip;
- struct xfs_perag *pag;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_ino_t inum;
int error;
- inum = xic->first_ino;
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2569,8 +2212,9 @@ xfs_ifree_cluster(
/*
* We obtain and lock the backing buffer first in the process
- * here, as we have to ensure that any dirty inode that we
- * can't get the flush lock on is attached to the buffer.
+ * here to ensure dirty inodes attached to the buffer remain in
+ * the flushing state while we mark them stale.
+ *
* If we scan the in-memory inodes first, then buffer IO can
* complete before we get a lock on it, and hence we may fail
* to mark all the active inodes on the buffer stale.
@@ -2593,196 +2237,84 @@ xfs_ifree_cluster(
bp->b_ops = &xfs_inode_buf_ops;
/*
- * Walk the inodes already attached to the buffer and mark them
- * stale. These will all have the flush locks held, so an
- * in-memory inode walk can't lock them. By marking them all
- * stale first, we will not attempt to lock them in the loop
- * below as the XFS_ISTALE flag will be set.
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
+ * too. This requires lookups, and will skip inodes that we've
+ * already marked XFS_ISTALE.
*/
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = xfs_istale_done;
- xfs_trans_ail_copy_lsn(mp->m_ail,
- &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- }
- }
-
-
- /*
- * For each inode in memory attempt to add it to the inode
- * buffer and set it up for being staled on buffer IO
- * completion. This is safe as we've locked out tail pushing
- * and flushing by locking the buffer.
- *
- * We have already marked every inode that was part of a
- * transaction stale above, which means there is no point in
- * even trying to lock them.
- */
- for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, (inum + i)));
-
- /* Inode not in memory, nothing to do */
- if (!ip) {
- rcu_read_unlock();
- continue;
- }
-
- /*
- * because this is an RCU protected lookup, we could
- * find a recently freed or even reallocated inode
- * during the lookup. We need to check under the
- * i_flags_lock for a valid inode here. Skip it if it
- * is not valid, the wrong inode or stale.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum + i ||
- __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- continue;
- }
- spin_unlock(&ip->i_flags_lock);
-
- /*
- * Don't try to lock/unlock the current inode, but we
- * _cannot_ skip the other inodes that we did not find
- * in the list attached to the buffer and are not
- * already marked stale. If we can't lock it, back off
- * and retry.
- */
- if (ip != free_ip) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
- }
-
- /*
- * Check the inode number again in case we're
- * racing with freeing in xfs_reclaim_inode().
- * See the comments in that function for more
- * information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum + i) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- rcu_read_unlock();
- continue;
- }
- }
- rcu_read_unlock();
-
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
-
- /*
- * we don't need to attach clean inodes or those only
- * with unlogged changes (which we throw away, anyway).
- */
- iip = ip->i_itemp;
- if (!iip || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
- iip->ili_last_fields = iip->ili_fields;
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
-
- xfs_buf_attach_iodone(bp, xfs_istale_done,
- &iip->ili_item);
-
- if (ip != free_ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
+ xfs_ifree_mark_inode_stale(pag, free_ip, inum + i);
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
-
- xfs_perag_put(pag);
return 0;
}
/*
- * Free any local-format buffers sitting around before we reset to
- * extents format.
- */
-static inline void
-xfs_ifree_local_data(
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_ifork *ifp;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-}
-
-/*
- * This is called to return an inode to the inode free list.
- * The inode should already be truncated to 0 length and have
- * no pages associated with it. This routine also assumes that
- * the inode is already a part of the transaction.
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
*
- * The on-disk copy of the inode will have been added to the list
- * of unlinked inodes in the AGI. We need to remove the inode from
- * that list atomically with respect to freeing it here.
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
*/
int
xfs_ifree(
struct xfs_trans *tp,
struct xfs_inode *ip)
{
- int error;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
struct xfs_icluster xic = { 0 };
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(ip->i_d.di_nextents == 0);
- ASSERT(ip->i_d.di_anextents == 0);
- ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
- ASSERT(ip->i_d.di_nblocks == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
+ ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
+ ASSERT(ip->i_nblocks == 0);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/*
- * Pull the on-disk inode from the AGI unlinked list.
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
*/
- error = xfs_iunlink_remove(tp, ip);
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
- return error;
+ goto out;
- error = xfs_difree(tp, ip->i_ino, &xic);
+ error = xfs_iunlink_remove(tp, pag, ip);
if (error)
- return error;
+ goto out;
- xfs_ifree_local_data(ip, XFS_DATA_FORK);
- xfs_ifree_local_data(ip, XFS_ATTR_FORK);
+ /*
+ * Free any local-format data sitting around before we reset the
+ * data fork to extents format. Note that the attr fork data has
+ * already been freed by xfs_attr_inactive.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ kmem_free(ip->i_df.if_u1.if_data);
+ ip->i_df.if_u1.if_data = NULL;
+ ip->i_df.if_bytes = 0;
+ }
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
- ip->i_d.di_flags = 0;
- ip->i_d.di_flags2 = 0;
- ip->i_d.di_dmevmask = 0;
- ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ ip->i_diflags = 0;
+ ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
+ ip->i_forkoff = 0; /* mark the attr fork not in use */
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+ if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
+ xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
/* Don't attempt to replay owner changes for a deleted inode */
- ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+ spin_lock(&iip->ili_lock);
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+ spin_unlock(&iip->ili_lock);
/*
* Bump the generation count so no one will be confused
@@ -2792,8 +2324,9 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xic.deleted)
- error = xfs_ifree_cluster(ip, tp, &xic);
-
+ error = xfs_ifree_cluster(tp, pag, ip, &xic);
+out:
+ xfs_perag_put(pag);
return error;
}
@@ -2811,7 +2344,7 @@ xfs_iunpin(
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
}
@@ -2876,12 +2409,13 @@ xfs_remove(
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+ int dontcare;
int error = 0;
uint resblks;
trace_xfs_remove(dp, name);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(dp);
@@ -2893,31 +2427,24 @@ xfs_remove(
goto std_return;
/*
- * We try to get the real space reservation first,
- * allowing for directory btree deletion(s) implying
- * possible bmap insert(s). If we can't get the space
- * reservation then we use 0 instead, and avoid the bmap
- * btree insert(s) in the directory code by, if the bmap
- * insert tries to happen, instead trimming the LAST
- * block from the directory.
+ * We try to get the real space reservation first, allowing for
+ * directory btree deletion(s) implying possible bmap insert(s). If we
+ * can't get the space reservation then we use 0 instead, and avoid the
+ * bmap btree insert(s) in the directory code by, if the bmap insert
+ * tries to happen, instead trimming the LAST block from the directory.
+ *
+ * Ignore EDQUOT and ENOSPC being returned via nospace_error because
+ * the directory code can handle a reservationless update and we don't
+ * want to prevent a user from trying to free space by deleting things.
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
- &tp);
- }
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
+ &tp, &dontcare);
if (error) {
ASSERT(error != -ENOSPC);
goto std_return;
}
- xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
/*
* If we're removing a directory perform some additional validation.
*/
@@ -2941,6 +2468,19 @@ xfs_remove(
error = xfs_droplink(tp, ip);
if (error)
goto out_trans_cancel;
+
+ /*
+ * Point the unlinked child directory's ".." entry to the root
+ * directory to eliminate back-references to inodes that may
+ * get freed before the child directory is closed. If the fs
+ * gets shrunk, this can lead to dirent inode validation errors.
+ */
+ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+ error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+ tp->t_mountp->m_sb.sb_rootino, 0);
+ if (error)
+ return error;
+ }
} else {
/*
* When removing a non-directory we need to log the parent
@@ -2967,7 +2507,7 @@ xfs_remove(
* remove transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -3044,7 +2584,7 @@ xfs_finish_rename(
* If this is a synchronous mount, make sure that the rename transaction
* goes to disk before returning to the user.
*/
- if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
@@ -3053,7 +2593,7 @@ xfs_finish_rename(
/*
* xfs_cross_rename()
*
- * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall
+ * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
*/
STATIC int
xfs_cross_rename(
@@ -3162,23 +2702,36 @@ out_trans_abort:
/*
* xfs_rename_alloc_whiteout()
*
- * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * Return a referenced, unlinked, unlocked inode that can be used as a
* whiteout in a rename transaction. We use a tmpfile inode here so that if we
* crash between allocating the inode and linking it into the rename transaction
* recovery will free the inode and we won't leak it.
*/
static int
xfs_rename_alloc_whiteout(
+ struct user_namespace *mnt_userns,
+ struct xfs_name *src_name,
struct xfs_inode *dp,
struct xfs_inode **wip)
{
struct xfs_inode *tmpfile;
+ struct qstr name;
int error;
- error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+ error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
+ &tmpfile);
if (error)
return error;
+ name.name = src_name->name;
+ name.len = src_name->len;
+ error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
+ if (error) {
+ xfs_finish_inode_setup(tmpfile);
+ xfs_irele(tmpfile);
+ return error;
+ }
+
/*
* Prepare the tmpfile inode as if it were created through the VFS.
* Complete the inode setup and flag it as linkable. nlink is already
@@ -3197,6 +2750,7 @@ xfs_rename_alloc_whiteout(
*/
int
xfs_rename(
+ struct user_namespace *mnt_userns,
struct xfs_inode *src_dp,
struct xfs_name *src_name,
struct xfs_inode *src_ip,
@@ -3209,12 +2763,13 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
- struct xfs_buf *agibp;
+ int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
int spaceres;
- int error;
+ bool retried = false;
+ int error, nospace_error = 0;
trace_xfs_rename(src_dp, target_dp, src_name, target_name);
@@ -3227,8 +2782,8 @@ xfs_rename(
* appropriately.
*/
if (flags & RENAME_WHITEOUT) {
- ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
- error = xfs_rename_alloc_whiteout(target_dp, &wip);
+ error = xfs_rename_alloc_whiteout(mnt_userns, src_name,
+ target_dp, &wip);
if (error)
return error;
@@ -3239,9 +2794,12 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
+retry:
+ nospace_error = 0;
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
+ nospace_error = error;
spaceres = 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
&tp);
@@ -3260,7 +2818,7 @@ xfs_rename(
* Lock all the participating inodes. Depending upon whether
* the target_name exists in the target directory, and
* whether the target directory is the same as the source
- * directory, we can lock from 2 to 4 inodes.
+ * directory, we can lock from 2 to 5 inodes.
*/
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
@@ -3283,8 +2841,8 @@ xfs_rename(
* into our tree when the project IDs are the same; else the
* tree quota mechanism would be circumvented.
*/
- if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
+ if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ target_dp->i_projid != src_ip->i_projid)) {
error = -EXDEV;
goto out_trans_cancel;
}
@@ -3296,6 +2854,31 @@ xfs_rename(
spaceres);
/*
+ * Try to reserve quota to handle an expansion of the target directory.
+ * We'll allow the rename to continue in reservationless mode if we hit
+ * a space usage constraint. If we trigger reservationless mode, save
+ * the errno if there isn't any free space in the target directory.
+ */
+ if (spaceres != 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
+ 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(target_dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ nospace_error = error;
+ spaceres = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*/
@@ -3323,6 +2906,32 @@ xfs_rename(
}
/*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
+
+ pag = xfs_perag_get(mp,
+ XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
+ error = xfs_read_agi(pag, tp, &bp);
+ xfs_perag_put(pag);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
* Directory entry creation below may acquire the AGF. Remove
* the whiteout from the unlinked list first to preserve correct
* AGI/AGF locking order. This dirties the transaction so failures
@@ -3335,8 +2944,13 @@ xfs_rename(
* in future.
*/
if (wip) {
+ struct xfs_perag *pag;
+
ASSERT(VFS_I(wip)->i_nlink == 0);
- error = xfs_iunlink_remove(tp, wip);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, wip);
+ xfs_perag_put(pag);
if (error)
goto out_trans_cancel;
@@ -3374,22 +2988,6 @@ xfs_rename(
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
-
- /*
- * Check whether the replace operation will need to allocate
- * blocks. This happens when the shortform directory lacks
- * space and we have to convert it to a block format directory.
- * When more blocks are necessary, we must lock the AGI first
- * to preserve locking order (AGI -> AGF).
- */
- if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
- error = xfs_read_agi(mp, tp,
- XFS_INO_TO_AGNO(mp, target_ip->i_ino),
- &agibp);
- if (error)
- goto out_trans_cancel;
- }
-
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
@@ -3462,12 +3060,13 @@ xfs_rename(
* inode number of the whiteout inode rather than removing it
* altogether.
*/
- if (wip) {
+ if (wip)
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else
+ else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
+
if (error)
goto out_trans_cancel;
@@ -3486,394 +3085,105 @@ out_trans_cancel:
out_release_wip:
if (wip)
xfs_irele(wip);
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
-STATIC int
-xfs_iflush_cluster(
- struct xfs_inode *ip,
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- unsigned long first_index, mask;
- int cilist_size;
- struct xfs_inode **cilist;
- struct xfs_inode *cip;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- int nr_found;
- int clcount = 0;
- int i;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
- cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
- cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
- if (!cilist)
- goto out_put;
-
- mask = ~(igeo->inodes_per_cluster - 1);
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
- rcu_read_lock();
- /* really need a gang lookup range call here */
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
- first_index, igeo->inodes_per_cluster);
- if (nr_found == 0)
- goto out_free;
-
- for (i = 0; i < nr_found; i++) {
- cip = cilist[i];
- if (cip == ip)
- continue;
-
- /*
- * because this is an RCU protected lookup, we could find a
- * recently freed or even reallocated inode during the lookup.
- * We need to check under the i_flags_lock for a valid inode
- * here. Skip it if it is not valid or the wrong inode.
- */
- spin_lock(&cip->i_flags_lock);
- if (!cip->i_ino ||
- __xfs_iflags_test(cip, XFS_ISTALE)) {
- spin_unlock(&cip->i_flags_lock);
- continue;
- }
-
- /*
- * Once we fall off the end of the cluster, no point checking
- * any more inodes in the list because they will also all be
- * outside the cluster.
- */
- if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
- spin_unlock(&cip->i_flags_lock);
- break;
- }
- spin_unlock(&cip->i_flags_lock);
-
- /*
- * Do an un-protected check to see if the inode is dirty and
- * is a candidate for flushing. These checks will be repeated
- * later after the appropriate locks are acquired.
- */
- if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
- continue;
-
- /*
- * Try to get locks. If any are unavailable or it is pinned,
- * then this inode cannot be flushed and is skipped.
- */
-
- if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
- continue;
- if (!xfs_iflock_nowait(cip)) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
- if (xfs_ipincount(cip)) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
-
- /*
- * Check the inode number again, just to be certain we are not
- * racing with freeing in xfs_reclaim_inode(). See the comments
- * in that function for more information as to why the initial
- * check is not sufficient.
- */
- if (!cip->i_ino) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
- /*
- * arriving here means that this inode can be flushed. First
- * re-check that it's dirty before flushing.
- */
- if (!xfs_inode_clean(cip)) {
- int error;
- error = xfs_iflush_int(cip, bp);
- if (error) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- goto cluster_corrupt_out;
- }
- clcount++;
- } else {
- xfs_ifunlock(cip);
- }
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- }
-
- if (clcount) {
- XFS_STATS_INC(mp, xs_icluster_flushcnt);
- XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
- }
-
-out_free:
- rcu_read_unlock();
- kmem_free(cilist);
-out_put:
- xfs_perag_put(pag);
- return 0;
-
-
-cluster_corrupt_out:
- /*
- * Corruption detected in the clustering loop. Invalidate the
- * inode buffer and shut down the filesystem.
- */
- rcu_read_unlock();
-
- /*
- * We'll always have an inode attached to the buffer for completion
- * process by the time we are called from xfs_iflush(). Hence we have
- * always need to do IO completion processing to abort the inodes
- * attached to the buffer. handle them just like the shutdown case in
- * xfs_buf_submit().
- */
- ASSERT(bp->b_iodone);
- bp->b_flags |= XBF_ASYNC;
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioerror(bp, -EIO);
- xfs_buf_ioend(bp);
-
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(cip, false);
- kmem_free(cilist);
- xfs_perag_put(pag);
- return -EFSCORRUPTED;
-}
-
-/*
- * Flush dirty inode metadata into the backing buffer.
- *
- * The caller must have the inode lock and the inode flush lock held. The
- * inode lock will still be held upon return to the caller, and the inode
- * flush lock will be released after the inode has reached the disk.
- *
- * The caller must write out the buffer returned in *bpp and release it.
- */
-int
+static int
xfs_iflush(
struct xfs_inode *ip,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp = NULL;
- struct xfs_dinode *dip;
- int error;
-
- XFS_STATS_INC(mp, xs_iflush_count);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
- *bpp = NULL;
-
- xfs_iunpin_wait(ip);
-
- /*
- * For stale inodes we cannot rely on the backing buffer remaining
- * stale in cache for the remaining life of the stale inode and so
- * xfs_imap_to_bp() below may give us a buffer that no longer contains
- * inodes below. We have to check this after ensuring the inode is
- * unpinned so that it is safe to reclaim the stale inode after the
- * flush call.
- */
- if (xfs_iflags_test(ip, XFS_ISTALE)) {
- xfs_ifunlock(ip);
- return 0;
- }
-
- /*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this inode
- * to disk, because the log record didn't make it to disk.
- *
- * We also have to remove the log item from the AIL in this case,
- * as we wait for an empty AIL as part of the unmount process.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = -EIO;
- goto abort_out;
- }
-
- /*
- * Get the buffer containing the on-disk inode. We are doing a try-lock
- * operation here, so we may get an EAGAIN error. In that case, we
- * simply want to return with the inode still dirty.
- *
- * If we get any other error, we effectively have a corruption situation
- * and we cannot flush the inode, so we treat it the same as failing
- * xfs_iflush_int().
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
- 0);
- if (error == -EAGAIN) {
- xfs_ifunlock(ip);
- return error;
- }
- if (error)
- goto corrupt_out;
-
- /*
- * First flush out the inode that xfs_iflush was called with.
- */
- error = xfs_iflush_int(ip, bp);
- if (error)
- goto corrupt_out;
-
- /*
- * If the buffer is pinned then push on the log now so we won't
- * get stuck waiting in the write for too long.
- */
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
-
- /*
- * inode clustering: try to gather other inodes into this write
- *
- * Note: Any error during clustering will result in the filesystem
- * being shut down and completion callbacks run on the cluster buffer.
- * As we have already flushed and attached this inode to the buffer,
- * it has already been aborted and released by xfs_iflush_cluster() and
- * so we have no further error handling to do here.
- */
- error = xfs_iflush_cluster(ip, bp);
- if (error)
- return error;
-
- *bpp = bp;
- return 0;
-
-corrupt_out:
- if (bp)
- xfs_buf_relse(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-abort_out:
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(ip, false);
- return error;
-}
-
-/*
- * If there are inline format data / attr forks attached to this inode,
- * make sure they're not corrupt.
- */
-bool
-xfs_inode_verify_forks(
- struct xfs_inode *ip)
-{
- struct xfs_ifork *ifp;
- xfs_failaddr_t fa;
-
- fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
- return false;
- }
-
- fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp ? ifp->if_u1.if_data : NULL,
- ifp ? ifp->if_bytes : 0, fa);
- return false;
- }
- return true;
-}
-
-STATIC int
-xfs_iflush_int(
- struct xfs_inode *ip,
struct xfs_buf *bp)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct xfs_dinode *dip;
struct xfs_mount *mp = ip->i_mount;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- ASSERT(iip != NULL && iip->ili_fields != 0);
- ASSERT(ip->i_d.di_version > 1);
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+ ASSERT(iip->ili_item.li_buf == bp);
- /* set *dip = inode's place in the buffer */
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ /*
+ * We don't flush the inode if any of the following checks fail, but we
+ * do still update the log item and attach to the backing buffer as if
+ * the flush happened. This is a formality to facilitate predictable
+ * error handling as the caller will shutdown and fail the buffer.
+ */
+ error = -EFSCORRUPTED;
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
+ "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
- goto corrupt_out;
+ goto flush_out;
}
if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad regular inode %Lu, ptr "PTR_FMT,
+ "%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
- goto corrupt_out;
+ goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad directory inode %Lu, ptr "PTR_FMT,
+ "%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
- goto corrupt_out;
+ goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
- ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
+ ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: detected corrupt incore inode %Lu, "
- "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
+ "%s: detected corrupt incore inode %llu, "
+ "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
- ip->i_d.di_nextents + ip->i_d.di_anextents,
- ip->i_d.di_nblocks, ip);
- goto corrupt_out;
+ ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
+ ip->i_nblocks, ip);
+ goto flush_out;
}
- if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
+ if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
- __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
- goto corrupt_out;
+ "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
+ __func__, ip->i_ino, ip->i_forkoff, ip);
+ goto flush_out;
}
/*
- * Inode item log recovery for v2 inodes are dependent on the
- * di_flushiter count for correct sequencing. We bump the flush
- * iteration count so we can detect flushes which postdate a log record
- * during recovery. This is redundant as we now log every change and
- * hence this can't happen but we need to still do it to ensure
- * backwards compatibility with old kernels that predate logging all
- * inode changes.
+ * Inode item log recovery for v2 inodes are dependent on the flushiter
+ * count for correct sequencing. We bump the flush iteration count so
+ * we can detect flushes which postdate a log record during recovery.
+ * This is redundant as we now log every change and hence this can't
+ * happen but we need to still do it to ensure backwards compatibility
+ * with old kernels that predate logging all inode changes.
*/
- if (ip->i_d.di_version < 3)
- ip->i_d.di_flushiter++;
+ if (!xfs_has_v3inodes(mp))
+ ip->i_flushiter++;
- /* Check the inline fork data before we write out. */
- if (!xfs_inode_verify_forks(ip))
- goto corrupt_out;
+ /*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they are not corrupt.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_data(ip))
+ goto flush_out;
+ if (xfs_inode_has_attr_fork(ip) &&
+ ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_attr(ip))
+ goto flush_out;
/*
* Copy the dirty parts of the inode into the on-disk inode. We always
@@ -3883,13 +3193,14 @@ xfs_iflush_int(
xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
/* Wrap, we never let the log put out DI_MAX_FLUSH */
- if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
- ip->i_d.di_flushiter = 0;
+ if (!xfs_has_v3inodes(mp)) {
+ if (ip->i_flushiter == DI_MAX_FLUSH)
+ ip->i_flushiter = 0;
+ }
xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
- xfs_inobp_check(mp, bp);
/*
* We've recorded everything logged in the inode, so we'd like to clear
@@ -3902,45 +3213,154 @@ xfs_iflush_int(
*
* What we do is move the bits to the ili_last_fields field. When
* logging the inode, these bits are moved back to the ili_fields field.
- * In the xfs_iflush_done() routine we clear ili_last_fields, since we
- * know that the information those bits represent is permanently on
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
+ * we know that the information those bits represent is permanently on
* disk. As long as the flush completes before the inode is logged
* again, then both ili_fields and ili_last_fields will be cleared.
- *
- * We can play with the ili_fields bits here, because the inode lock
- * must be held exclusively in order to set bits there and the flush
- * lock protects the ili_last_fields bits. Set ili_logged so the flush
- * done routine can tell whether or not to look in the AIL. Also, store
- * the current LSN of the inode so that we can tell whether the item has
- * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
- * need the AIL lock, because it is a 64 bit value that cannot be read
- * atomically.
*/
+ error = 0;
+flush_out:
+ spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
+ spin_unlock(&iip->ili_lock);
+ /*
+ * Store the current LSN of the inode so that we can tell whether the
+ * item has moved in the AIL from xfs_buf_inode_iodone().
+ */
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
+ /* generate the checksum. */
+ xfs_dinode_calc_crc(mp, dip);
+ return error;
+}
+
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function will walk across all the inodes on the cluster buffer it
+ * can find and lock without blocking, and flush them to the cluster buffer.
+ *
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
+ * will be returned.
+ */
+int
+xfs_iflush_cluster(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_log_item *lip, *n;
+ struct xfs_inode *ip;
+ struct xfs_inode_log_item *iip;
+ int clcount = 0;
+ int error = 0;
+
/*
- * Attach the function xfs_iflush_done to the inode's
- * buffer. This will remove the inode from the AIL
- * and unlock the inode's flush lock when the inode is
- * completely written to disk.
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
+ * will remove itself from the list.
*/
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ iip = (struct xfs_inode_log_item *)lip;
+ ip = iip->ili_inode;
- /* generate the checksum. */
- xfs_dinode_calc_crc(mp, dip);
+ /*
+ * Quick and dirty check to avoid locks if possible.
+ */
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
+ continue;
+ if (xfs_ipincount(ip))
+ continue;
+
+ /*
+ * The inode is still attached to the buffer, which means it is
+ * dirty but reclaim might try to grab it. Check carefully for
+ * that, and grab the ilock while still holding the i_flags_lock
+ * to guarantee reclaim will not be able to reclaim this inode
+ * once we drop the i_flags_lock.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+
+ /*
+ * ILOCK will pin the inode against reclaim and prevent
+ * concurrent transactions modifying the inode while we are
+ * flushing the inode. If we get the lock, set the flushing
+ * state before we drop the i_flags_lock.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Abort flushing this inode if we are shut down because the
+ * inode may not currently be in the AIL. This can occur when
+ * log I/O failure unpins the inode without inserting into the
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
+ * that otherwise looks like it should be flushed.
+ */
+ if (xlog_is_shutdown(mp->m_log)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = -EIO;
+ continue;
+ }
+
+ /* don't block waiting on a log force to unpin dirty inodes */
+ if (xfs_ipincount(ip)) {
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
- ASSERT(!list_empty(&bp->b_li_list));
- ASSERT(bp->b_iodone != NULL);
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, bp);
+ else
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+ clcount++;
+ }
+
+ if (error) {
+ /*
+ * Shutdown first so we kill the log before we release this
+ * buffer. If it is an INODE_ALLOC buffer and pins the tail
+ * of the log, failing it before the _log_ is shut down can
+ * result in the log tail being moved forward in the journal
+ * on disk because log writes can still be taking place. Hence
+ * unpinning the tail will allow the ICREATE intent to be
+ * removed from the log an recovery will fail with uninitialised
+ * inode cluster buffers.
+ */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ return error;
+ }
+
+ if (!clcount)
+ return -EAGAIN;
+
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
return 0;
-corrupt_out:
- return -EFSCORRUPTED;
}
/* Release an inode. */
@@ -3951,3 +3371,170 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all commited transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_csn_t seq = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ seq = ip->i_itemp->ili_commit_seq;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!seq)
+ return 0;
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
+}
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+
+ if (src > dest)
+ swap(src, dest);
+
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
+ error = break_layout(dest, true);
+ if (error)
+ return error;
+ }
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ return 0;
+}
+
+static int
+xfs_mmaplock_two_inodes_and_break_dax_layout(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int error;
+ bool retry;
+ struct page *page;
+
+ if (ip1->i_ino > ip2->i_ino)
+ swap(ip1, ip2);
+
+again:
+ retry = false;
+ /* Lock the first inode */
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
+ if (error || retry) {
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ if (error == 0 && retry)
+ goto again;
+ return error;
+ }
+
+ if (ip1 == ip2)
+ return 0;
+
+ /* Nested lock the second inode */
+ xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
+ /*
+ * We cannot use xfs_break_dax_layouts() directly here because it may
+ * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
+ * for this nested lock case.
+ */
+ page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
+ if (page && page_ref_count(page) != 1) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ goto again;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int ret;
+
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ if (ret)
+ return ret;
+
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
+ if (ret) {
+ inode_unlock(VFS_I(ip2));
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+ return ret;
+ }
+ } else
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
+ return 0;
+}
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ } else
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
+ inode_unlock(VFS_I(ip2));
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 492e53992fa9..fa780f08dc89 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -33,15 +33,15 @@ typedef struct xfs_inode {
struct xfs_imap i_imap; /* location for xfs_imap() */
/* Extent information. */
- struct xfs_ifork *i_afp; /* attribute fork pointer */
struct xfs_ifork *i_cowfp; /* copy on write extents */
struct xfs_ifork i_df; /* data fork */
+ struct xfs_ifork i_af; /* attribute fork */
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
- mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
+ struct llist_node i_gclist; /* deferred inactivation list */
/*
* Bitsets of inode metadata that have been checked and/or are sick.
@@ -54,11 +54,23 @@ typedef struct xfs_inode {
/* Miscellaneous state. */
unsigned long i_flags; /* see defined flags below */
uint64_t i_delayed_blks; /* count of delay alloc blks */
-
- struct xfs_icdinode i_d; /* most of ondisk inode */
-
- xfs_extnum_t i_cnextents; /* # of extents in cow fork */
- unsigned int i_cformat; /* format of cow fork */
+ xfs_fsize_t i_disk_size; /* number of bytes in file */
+ xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
+ prid_t i_projid; /* owner's project id */
+ xfs_extlen_t i_extsize; /* basic/minimum extent size */
+ /* cowextsize is only used for v3 inodes, flushiter for v1/2 */
+ union {
+ xfs_extlen_t i_cowextsize; /* basic cow extent size */
+ uint16_t i_flushiter; /* incremented on flush */
+ };
+ uint8_t i_forkoff; /* attr fork offset >> 3 */
+ uint16_t i_diflags; /* XFS_DIFLAG_... */
+ uint64_t i_diflags2; /* XFS_DIFLAG2_... */
+ struct timespec64 i_crtime; /* time created */
+
+ /* unlinked list pointers */
+ xfs_agino_t i_next_unlinked;
+ xfs_agino_t i_prev_unlinked;
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -69,6 +81,66 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
+static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+{
+ return ip->i_forkoff > 0;
+}
+
+static inline struct xfs_ifork *
+xfs_ifork_ptr(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return &ip->i_df;
+ case XFS_ATTR_FORK:
+ if (!xfs_inode_has_attr_fork(ip))
+ return NULL;
+ return &ip->i_af;
+ case XFS_COW_FORK:
+ return ip->i_cowfp;
+ default:
+ ASSERT(0);
+ return NULL;
+ }
+}
+
+static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip)
+{
+ return ip->i_forkoff << 3;
+}
+
+static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return xfs_inode_fork_boff(ip);
+
+ return XFS_LITINO(ip->i_mount);
+}
+
+static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip);
+ return 0;
+}
+
+static inline unsigned int
+xfs_inode_fork_size(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_inode_data_fork_size(ip);
+ case XFS_ATTR_FORK:
+ return xfs_inode_attr_fork_size(ip);
+ default:
+ return 0;
+ }
+}
+
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -90,7 +162,7 @@ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
{
if (S_ISREG(VFS_I(ip)->i_mode))
return i_size_read(VFS_I(ip));
- return ip->i_d.di_size;
+ return ip->i_disk_size;
}
/*
@@ -104,7 +176,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
if (new_size > i_size || new_size < 0)
new_size = i_size;
- return new_size > ip->i_d.di_size ? new_size : 0;
+ return new_size > ip->i_disk_size ? new_size : 0;
}
/*
@@ -177,15 +249,23 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
static inline prid_t
xfs_get_initial_prid(struct xfs_inode *dp)
{
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- return dp->i_d.di_projid;
+ if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ return dp->i_projid;
return XFS_PROJID_DEFAULT;
}
static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
{
- return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+ return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
+}
+
+static inline bool xfs_is_metadata_inode(struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ return ip == mp->m_rbmip || ip == mp->m_rsumip ||
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
/*
@@ -197,6 +277,16 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
return ip->i_cowfp && ip->i_cowfp->if_bytes;
}
+static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
+}
+
+static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -210,16 +300,15 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
-#define __XFS_INEW_BIT 3 /* inode has just been allocated */
-#define XFS_INEW (1 << __XFS_INEW_BIT)
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
-#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
-#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
-#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
-#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */
+#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */
+#define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */
/*
* If this unlinked inode is in the middle of recovery, don't let drop_inode
* truncate and free the inode. This can happen if we iget the inode during
@@ -229,55 +318,41 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
#define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */
/*
+ * If we need to update on-disk metadata before this IRECLAIMABLE inode can be
+ * freed, then NEED_INACTIVE will be set. Once we start the updates, the
+ * INACTIVATING bit will be set to keep iget away from this inode. After the
+ * inactivation completes, both flags will be cleared and the inode is a
+ * plain old IRECLAIMABLE inode.
+ */
+#define XFS_INACTIVATING (1 << 13)
+
+/* All inode state flags related to inode reclaim. */
+#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM | \
+ XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING)
+
+/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
* inode lookup. This prevents unintended behaviour on the new inode from
* ocurring.
*/
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
- XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
-
-/*
- * Synchronize processes attempting to flush the in-core inode back to disk.
- */
-
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
- return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
-extern void __xfs_iflock(struct xfs_inode *ip);
-
-static inline int xfs_iflock_nowait(struct xfs_inode *ip)
-{
- return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
-}
-
-static inline void xfs_iflock(struct xfs_inode *ip)
-{
- if (!xfs_iflock_nowait(ip))
- __xfs_iflock(ip);
-}
-
-static inline void xfs_ifunlock(struct xfs_inode *ip)
-{
- ASSERT(xfs_isiflocked(ip));
- xfs_iflags_clear(ip, XFS_IFLOCK);
- smp_mb();
- wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
-}
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
+ XFS_INACTIVATING)
/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
*/
-#define XFS_IOLOCK_EXCL (1<<0)
-#define XFS_IOLOCK_SHARED (1<<1)
-#define XFS_ILOCK_EXCL (1<<2)
-#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_MMAPLOCK_EXCL (1<<4)
-#define XFS_MMAPLOCK_SHARED (1<<5)
+#define XFS_IOLOCK_EXCL (1u << 0)
+#define XFS_IOLOCK_SHARED (1u << 1)
+#define XFS_ILOCK_EXCL (1u << 2)
+#define XFS_ILOCK_SHARED (1u << 3)
+#define XFS_MMAPLOCK_EXCL (1u << 4)
+#define XFS_MMAPLOCK_SHARED (1u << 5)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
@@ -344,19 +419,19 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
*/
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_MAX_SUBCLASS 3
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_DEP_MASK 0x000f0000u
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
#define XFS_MMAPLOCK_MAX_SUBCLASS 3
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u
#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_PARENT_VAL 5u
#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL 6
-#define XFS_ILOCK_RTSUM_VAL 7
-#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_RTBITMAP_VAL 6u
+#define XFS_ILOCK_RTSUM_VAL 7u
+#define XFS_ILOCK_DEP_MASK 0xff000000u
#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
@@ -392,22 +467,25 @@ enum layout_break_reason {
* new subdirectory gets S_ISGID bit from parent.
*/
#define XFS_INHERIT_GID(pip) \
- (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
- (VFS_I(pip)->i_mode & S_ISGID))
+ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
- umode_t mode, dev_t rdev, struct xfs_inode **ipp);
-int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode,
+int xfs_create(struct user_namespace *mnt_userns,
+ struct xfs_inode *dp, struct xfs_name *name,
+ umode_t mode, dev_t rdev, bool need_xattr,
+ struct xfs_inode **ipp);
+int xfs_create_tmpfile(struct user_namespace *mnt_userns,
+ struct xfs_inode *dp, umode_t mode,
struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
struct xfs_name *target_name);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+int xfs_rename(struct user_namespace *mnt_userns,
+ struct xfs_inode *src_dp, struct xfs_name *src_name,
struct xfs_inode *src_ip, struct xfs_inode *target_dp,
struct xfs_name *target_name,
struct xfs_inode *target_ip, unsigned int flags);
@@ -416,7 +494,7 @@ void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
-int xfs_isilocked(xfs_inode_t *, uint);
+bool xfs_isilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
@@ -426,19 +504,21 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
-int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
+int xfs_iflush_cluster(struct xfs_buf *);
void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
struct xfs_inode *ip1, uint ip1_mode);
xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
-int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
- xfs_nlink_t, dev_t, prid_t,
- struct xfs_inode **);
+int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp,
+ struct xfs_inode *pip, xfs_ino_t ino, umode_t mode,
+ xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs,
+ struct xfs_inode **ipp);
static inline int
xfs_itruncate_extents(
@@ -451,21 +531,14 @@ xfs_itruncate_extents(
}
/* from xfs_file.c */
-enum xfs_prealloc_flags {
- XFS_PREALLOC_SET = (1 << 1),
- XFS_PREALLOC_CLEAR = (1 << 2),
- XFS_PREALLOC_SYNC = (1 << 3),
- XFS_PREALLOC_INVISIBLE = (1 << 4),
-};
-
-int xfs_update_prealloc_flags(struct xfs_inode *ip,
- enum xfs_prealloc_flags flags);
+int xfs_break_dax_layouts(struct inode *inode, bool *retry);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
/* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
+extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
/*
* When setting up a newly allocated inode, we need to call
@@ -479,7 +552,6 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
xfs_iflags_clear(ip, XFS_INEW);
barrier();
unlock_new_inode(VFS_I(ip));
- wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
}
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
@@ -491,16 +563,16 @@ static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
void xfs_irele(struct xfs_inode *ip);
-extern struct kmem_zone *xfs_inode_zone;
+extern struct kmem_cache *xfs_inode_cache;
/* The default CoW extent size hint. */
#define XFS_DEFAULT_COWEXTSZ_HINT 32
-bool xfs_inode_verify_forks(struct xfs_inode *ip);
-
-int xfs_iunlink_init(struct xfs_perag *pag);
-void xfs_iunlink_destroy(struct xfs_perag *pag);
+bool xfs_inode_needs_inactive(struct xfs_inode *ip);
void xfs_end_io(struct work_struct *work);
+int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 8bd5d0de6321..ca2941ab6cbc 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,17 +17,32 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include <linux/iversion.h>
-kmem_zone_t *xfs_ili_zone; /* inode log item zone */
+struct kmem_cache *xfs_ili_cache; /* inode log item */
static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
+/*
+ * The logged size of an inode fork is always the current size of the inode
+ * fork. This means that when an inode fork is relogged, the size of the logged
+ * region is determined by the current state, not the combination of the
+ * previously logged state + the current state. This is different relogging
+ * behaviour to most other log items which will retain the size of the
+ * previously logged changes when smaller regions are relogged.
+ *
+ * Hence operations that remove data from the inode fork (e.g. shortform
+ * dir/attr remove, extent form extent removal, etc), the size of the relogged
+ * inode gets -smaller- rather than stays the same size as the previously logged
+ * size and this can result in the committing transaction reducing the amount of
+ * space being consumed by the CIL.
+ */
STATIC void
xfs_inode_item_data_fork_size(
struct xfs_inode_log_item *iip,
@@ -36,13 +51,13 @@ xfs_inode_item_data_fork_size(
{
struct xfs_inode *ip = iip->ili_inode;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
- ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
/* worst case, doesn't subtract delalloc extents */
- *nbytes += XFS_IFORK_DSIZE(ip);
+ *nbytes += xfs_inode_data_fork_size(ip);
*nvecs += 1;
}
break;
@@ -56,7 +71,7 @@ xfs_inode_item_data_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- *nbytes += roundup(ip->i_df.if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
*nvecs += 1;
}
break;
@@ -77,27 +92,27 @@ xfs_inode_item_attr_fork_size(
{
struct xfs_inode *ip = iip->ili_inode;
- switch (ip->i_d.di_aformat) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_d.di_anextents > 0 &&
- ip->i_afp->if_bytes > 0) {
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
/* worst case, doesn't subtract unused space */
- *nbytes += XFS_IFORK_ASIZE(ip);
+ *nbytes += xfs_inode_attr_fork_size(ip);
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0) {
- *nbytes += ip->i_afp->if_broot_bytes;
+ ip->i_af.if_broot_bytes > 0) {
+ *nbytes += ip->i_af.if_broot_bytes;
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0) {
- *nbytes += roundup(ip->i_afp->if_bytes, 4);
+ ip->i_af.if_bytes > 0) {
+ *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes);
*nvecs += 1;
}
break;
@@ -125,10 +140,10 @@ xfs_inode_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_log_dinode_size(ip->i_d.di_version);
+ xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
@@ -142,13 +157,13 @@ xfs_inode_item_format_data_fork(
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
- ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
struct xfs_bmbt_rec *p;
@@ -189,17 +204,12 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_u1.if_data != NULL);
- ASSERT(ip->i_d.di_size > 0);
+ ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data, data_bytes);
- ilf->ilf_dsize = (unsigned)data_bytes;
+ ip->i_df.if_u1.if_data,
+ ip->i_df.if_bytes);
+ ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
@@ -227,18 +237,18 @@ xfs_inode_item_format_attr_fork(
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
- switch (ip->i_d.di_aformat) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_d.di_anextents > 0 &&
- ip->i_afp->if_bytes > 0) {
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(xfs_iext_count(ip->i_afp) ==
- ip->i_d.di_anextents);
+ ASSERT(xfs_iext_count(&ip->i_af) ==
+ ip->i_af.if_nextents);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -255,13 +265,13 @@ xfs_inode_item_format_attr_fork(
~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0) {
- ASSERT(ip->i_afp->if_broot != NULL);
+ ip->i_af.if_broot_bytes > 0) {
+ ASSERT(ip->i_af.if_broot != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
- ip->i_afp->if_broot,
- ip->i_afp->if_broot_bytes);
- ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+ ip->i_af.if_broot,
+ ip->i_af.if_broot_bytes);
+ ilf->ilf_asize = ip->i_af.if_broot_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ABROOT;
@@ -272,18 +282,12 @@ xfs_inode_item_format_attr_fork(
~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_afp->if_bytes, 4);
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ ip->i_af.if_bytes > 0) {
+ ASSERT(ip->i_af.if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_afp->if_u1.if_data,
- data_bytes);
- ilf->ilf_asize = (unsigned)data_bytes;
+ ip->i_af.if_u1.if_data,
+ ip->i_af.if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -295,64 +299,123 @@ xfs_inode_item_format_attr_fork(
}
}
+/*
+ * Convert an incore timestamp to a log timestamp. Note that the log format
+ * specifies host endian format!
+ */
+static inline xfs_log_timestamp_t
+xfs_inode_to_log_dinode_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_log_legacy_timestamp *lits;
+ xfs_log_timestamp_t its;
+
+ if (xfs_inode_has_bigtime(ip))
+ return xfs_inode_encode_bigtime(tv);
+
+ lits = (struct xfs_log_legacy_timestamp *)&its;
+ lits->t_sec = tv.tv_sec;
+ lits->t_nsec = tv.tv_nsec;
+
+ return its;
+}
+
+/*
+ * The legacy DMAPI fields are only present in the on-disk and in-log inodes,
+ * but not in the in-memory one. But we are guaranteed to have an inode buffer
+ * in memory when logging an inode, so we can just copy it from the on-disk
+ * inode to the in-log inode here so that recovery of file system with these
+ * fields set to non-zero values doesn't lose them. For all other cases we zero
+ * the fields.
+ */
+static void
+xfs_copy_dm_fields_to_log_dinode(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ struct xfs_dinode *dip;
+
+ dip = xfs_buf_offset(ip->i_itemp->ili_item.li_buf,
+ ip->i_imap.im_boffset);
+
+ if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) {
+ to->di_dmevmask = be32_to_cpu(dip->di_dmevmask);
+ to->di_dmstate = be16_to_cpu(dip->di_dmstate);
+ } else {
+ to->di_dmevmask = 0;
+ to->di_dmstate = 0;
+ }
+}
+
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_big_anextents = xfs_ifork_nextents(&ip->i_af);
+ to->di_nrext64_pad = 0;
+ } else {
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(&ip->i_af);
+ }
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
struct xfs_log_dinode *to,
xfs_lsn_t lsn)
{
- struct xfs_icdinode *from = &ip->i_d;
struct inode *inode = VFS_I(ip);
to->di_magic = XFS_DINODE_MAGIC;
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = i_uid_read(inode);
+ to->di_gid = i_gid_read(inode);
+ to->di_projid_lo = ip->i_projid & 0xffff;
+ to->di_projid_hi = ip->i_projid >> 16;
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_uid = from->di_uid;
- to->di_gid = from->di_gid;
- to->di_projid_lo = from->di_projid & 0xffff;
- to->di_projid_hi = from->di_projid >> 16;
-
- memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime.t_sec = inode->i_atime.tv_sec;
- to->di_atime.t_nsec = inode->i_atime.tv_nsec;
- to->di_mtime.t_sec = inode->i_mtime.tv_sec;
- to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
- to->di_ctime.t_sec = inode->i_ctime.tv_sec;
- to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
- to->di_size = from->di_size;
- to->di_nblocks = from->di_nblocks;
- to->di_extsize = from->di_extsize;
- to->di_nextents = from->di_nextents;
- to->di_anextents = from->di_anextents;
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = from->di_dmevmask;
- to->di_dmstate = from->di_dmstate;
- to->di_flags = from->di_flags;
+ to->di_size = ip->i_disk_size;
+ to->di_nblocks = ip->i_nblocks;
+ to->di_extsize = ip->i_extsize;
+ to->di_forkoff = ip->i_forkoff;
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
+ to->di_flags = ip->i_diflags;
+
+ xfs_copy_dm_fields_to_log_dinode(ip, to);
/* log a dummy value to ensure log structure is fully initialised */
to->di_next_unlinked = NULLAGINO;
- if (from->di_version == 3) {
+ if (xfs_has_v3inodes(ip->i_mount)) {
+ to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
- to->di_crtime.t_sec = from->di_crtime.tv_sec;
- to->di_crtime.t_nsec = from->di_crtime.tv_nsec;
- to->di_flags2 = from->di_flags2;
- to->di_cowextsize = from->di_cowextsize;
+ to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
+ to->di_flags2 = ip->i_diflags2;
+ to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
- to->di_flushiter = from->di_flushiter;
+ to->di_version = 2;
+ to->di_flushiter = ip->i_flushiter;
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_log_dinode_iext_counters(ip, to);
}
/*
@@ -370,7 +433,7 @@ xfs_inode_item_format_core(
dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
- xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
}
/*
@@ -395,8 +458,6 @@ xfs_inode_item_format(
struct xfs_log_iovec *vecp = NULL;
struct xfs_inode_log_format *ilf;
- ASSERT(ip->i_d.di_version > 1);
-
ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
ilf->ilf_type = XFS_LI_INODE;
ilf->ilf_ino = ip->i_ino;
@@ -419,7 +480,7 @@ xfs_inode_item_format(
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
- if (XFS_IFORK_Q(ip)) {
+ if (xfs_inode_has_attr_fork(ip)) {
xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
} else {
iip->ili_fields &=
@@ -441,6 +502,7 @@ xfs_inode_item_pin(
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
atomic_inc(&ip->i_pincount);
@@ -452,6 +514,12 @@ xfs_inode_item_pin(
* item which was previously pinned with a call to xfs_inode_item_pin().
*
* Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
+ *
+ * Note that unpin can race with inode cluster buffer freeing marking the buffer
+ * stale. In that case, flush completions are run from the buffer unpin call,
+ * which may happen before the inode is unpinned. If we lose the race, there
+ * will be no buffer attached to the log item, but the inode will be marked
+ * XFS_ISTALE.
*/
STATIC void
xfs_inode_item_unpin(
@@ -461,28 +529,12 @@ xfs_inode_item_unpin(
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
+ ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
-/*
- * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
- * have been failed during writeback
- *
- * This informs the AIL that the inode is already flush locked on the next push,
- * and acquires a hold on the buffer to ensure that it isn't reclaimed before
- * dirty data makes it to disk.
- */
-STATIC void
-xfs_inode_item_error(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
- xfs_set_li_failed(lip, bp);
-}
-
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
@@ -496,69 +548,50 @@ xfs_inode_item_push(
uint rval = XFS_ITEM_SUCCESS;
int error;
- if (xfs_ipincount(ip) > 0)
+ if (!bp || (ip->i_flags & XFS_ISTALE)) {
+ /*
+ * Inode item/buffer is being aborted due to cluster
+ * buffer deletion. Trigger a log force to have that operation
+ * completed and items removed from the AIL before the next push
+ * attempt.
+ */
return XFS_ITEM_PINNED;
+ }
- /*
- * The buffer containing this item failed to be written back
- * previously. Resubmit the buffer for IO.
- */
- if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
- if (!xfs_buf_trylock(bp))
- return XFS_ITEM_LOCKED;
-
- if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list))
- rval = XFS_ITEM_FLUSHING;
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
+ return XFS_ITEM_PINNED;
- xfs_buf_unlock(bp);
- return rval;
- }
+ if (xfs_iflags_test(ip, XFS_IFLUSHING))
+ return XFS_ITEM_FLUSHING;
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+ if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
- /*
- * Re-check the pincount now that we stabilized the value by
- * taking the ilock.
- */
- if (xfs_ipincount(ip) > 0) {
- rval = XFS_ITEM_PINNED;
- goto out_unlock;
- }
-
- /*
- * Stale inode items should force out the iclog.
- */
- if (ip->i_flags & XFS_ISTALE) {
- rval = XFS_ITEM_PINNED;
- goto out_unlock;
- }
+ spin_unlock(&lip->li_ailp->ail_lock);
/*
- * Someone else is already flushing the inode. Nothing we can do
- * here but wait for the flush to finish and remove the item from
- * the AIL.
+ * We need to hold a reference for flushing the cluster buffer as it may
+ * fail the buffer without IO submission. In which case, we better get a
+ * reference for that completion because otherwise we don't get a
+ * reference for IO until we queue the buffer for delwri submission.
*/
- if (!xfs_iflock_nowait(ip)) {
- rval = XFS_ITEM_FLUSHING;
- goto out_unlock;
- }
-
- ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
- ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
-
- spin_unlock(&lip->li_ailp->ail_lock);
-
- error = xfs_iflush(ip, &bp);
+ xfs_buf_hold(bp);
+ error = xfs_iflush_cluster(bp);
if (!error) {
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
+ } else {
+ /*
+ * Release the buffer if we were unable to flush anything. On
+ * any other error, the buffer has already been released.
+ */
+ if (error == -EAGAIN)
+ xfs_buf_relse(bp);
+ rval = XFS_ITEM_LOCKED;
}
spin_lock(&lip->li_ailp->ail_lock);
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
return rval;
}
@@ -622,9 +655,9 @@ xfs_inode_item_committed(
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
+ INODE_ITEM(lip)->ili_commit_seq = seq;
return xfs_inode_item_release(lip);
}
@@ -637,7 +670,6 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
.iop_committing = xfs_inode_item_committing,
- .iop_error = xfs_inode_item_error
};
@@ -652,9 +684,11 @@ xfs_inode_item_init(
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
+ iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
iip->ili_inode = ip;
+ spin_lock_init(&iip->ili_lock);
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
&xfs_inode_item_ops);
}
@@ -664,160 +698,290 @@ xfs_inode_item_init(
*/
void
xfs_inode_item_destroy(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
- kmem_cache_free(xfs_ili_zone, ip->i_itemp);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ ASSERT(iip->ili_item.li_buf == NULL);
+
+ ip->i_itemp = NULL;
+ kmem_free(iip->ili_item.li_lv_shadow);
+ kmem_cache_free(xfs_ili_cache, iip);
}
/*
- * This is the inode flushing I/O completion routine. It is called
- * from interrupt level when the buffer containing the inode is
- * flushed to disk. It is responsible for removing the inode item
- * from the AIL if it has not been re-logged, and unlocking the inode's
- * flush lock.
- *
- * To reduce AIL lock traffic as much as possible, we scan the buffer log item
- * list for other inodes that will run this function. We remove them from the
- * buffer list so we can process all the inode IO completions in one AIL lock
- * traversal.
+ * We only want to pull the item from the AIL if it is actually there
+ * and its location in the log has not changed since we started the
+ * flush. Thus, we only bother if the inode's lsn has not changed.
*/
-void
-xfs_iflush_done(
- struct xfs_buf *bp,
- struct xfs_log_item *lip)
+static void
+xfs_iflush_ail_updates(
+ struct xfs_ail *ailp,
+ struct list_head *list)
{
- struct xfs_inode_log_item *iip;
- struct xfs_log_item *blip, *n;
- struct xfs_ail *ailp = lip->li_ailp;
- int need_ail = 0;
- LIST_HEAD(tmp);
+ struct xfs_log_item *lip;
+ xfs_lsn_t tail_lsn = 0;
- /*
- * Scan the buffer IO completions for other inodes being completed and
- * attach them to the current inode log item.
- */
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
+ spin_lock(&ailp->ail_lock);
+ list_for_each_entry(lip, list, li_bio_list) {
+ xfs_lsn_t lsn;
- list_add_tail(&lip->li_bio_list, &tmp);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
+ if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
+ continue;
- list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) {
- if (lip->li_cb != xfs_iflush_done)
+ /*
+ * dgc: Not sure how this happens, but it happens very
+ * occassionaly via generic/388. xfs_iflush_abort() also
+ * silently handles this same "under writeback but not in AIL at
+ * shutdown" condition via xfs_trans_ail_delete().
+ */
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ ASSERT(xlog_is_shutdown(lip->li_log));
continue;
+ }
+
+ lsn = xfs_ail_delete_one(ailp, lip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
+ }
+ xfs_ail_update_finish(ailp, tail_lsn);
+}
+
+/*
+ * Walk the list of inodes that have completed their IOs. If they are clean
+ * remove them from the list and dissociate them from the buffer. Buffers that
+ * are still dirty remain linked to the buffer and on the list. Caller must
+ * handle them appropriately.
+ */
+static void
+xfs_iflush_finish(
+ struct xfs_buf *bp,
+ struct list_head *list)
+{
+ struct xfs_log_item *lip, *n;
+
+ list_for_each_entry_safe(lip, n, list, li_bio_list) {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ bool drop_buffer = false;
+
+ spin_lock(&iip->ili_lock);
- list_move_tail(&blip->li_bio_list, &tmp);
/*
- * while we have the item, do the unlocked check for needing
- * the AIL lock.
+ * Remove the reference to the cluster buffer if the inode is
+ * clean in memory and drop the buffer reference once we've
+ * dropped the locks we hold.
*/
- iip = INODE_ITEM(blip);
- if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
- test_bit(XFS_LI_FAILED, &blip->li_flags))
- need_ail++;
+ ASSERT(iip->ili_item.li_buf == bp);
+ if (!iip->ili_fields) {
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&lip->li_bio_list);
+ drop_buffer = true;
+ }
+ iip->ili_last_fields = 0;
+ iip->ili_flush_lsn = 0;
+ spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
+ if (drop_buffer)
+ xfs_buf_rele(bp);
}
+}
- /* make sure we capture the state of the initial inode. */
- iip = INODE_ITEM(lip);
- if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
- test_bit(XFS_LI_FAILED, &lip->li_flags))
- need_ail++;
+/*
+ * Inode buffer IO completion routine. It is responsible for removing inodes
+ * attached to the buffer from the AIL if they have not been re-logged and
+ * completing the inode flush.
+ */
+void
+xfs_buf_inode_iodone(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip, *n;
+ LIST_HEAD(flushed_inodes);
+ LIST_HEAD(ail_updates);
/*
- * We only want to pull the item from the AIL if it is
- * actually there and its location in the log has not
- * changed since we started the flush. Thus, we only bother
- * if the ili_logged flag is set and the inode's lsn has not
- * changed. First we check the lsn outside
- * the lock since it's cheaper, and then we recheck while
- * holding the lock before removing the inode from the AIL.
+ * Pull the attached inodes from the buffer one at a time and take the
+ * appropriate action on them.
*/
- if (need_ail) {
- bool mlip_changed = false;
-
- /* this is an opencoded batch version of xfs_trans_ail_delete */
- spin_lock(&ailp->ail_lock);
- list_for_each_entry(blip, &tmp, li_bio_list) {
- if (INODE_ITEM(blip)->ili_logged &&
- blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
- mlip_changed |= xfs_ail_delete_one(ailp, blip);
- else {
- xfs_clear_li_failed(blip);
- }
- }
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
+ if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
+ xfs_iflush_abort(iip->ili_inode);
+ continue;
}
- spin_unlock(&ailp->ail_lock);
+ if (!iip->ili_last_fields)
+ continue;
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ /* Do an unlocked check for needing the AIL lock. */
+ if (iip->ili_flush_lsn == lip->li_lsn ||
+ test_bit(XFS_LI_FAILED, &lip->li_flags))
+ list_move_tail(&lip->li_bio_list, &ail_updates);
+ else
+ list_move_tail(&lip->li_bio_list, &flushed_inodes);
}
- /*
- * clean up and unlock the flush lock now we are done. We can clear the
- * ili_last_fields bits now that we know that the data corresponding to
- * them is safely on disk.
- */
- list_for_each_entry_safe(blip, n, &tmp, li_bio_list) {
- list_del_init(&blip->li_bio_list);
- iip = INODE_ITEM(blip);
- iip->ili_logged = 0;
- iip->ili_last_fields = 0;
- xfs_ifunlock(iip->ili_inode);
+ if (!list_empty(&ail_updates)) {
+ xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
+ list_splice_tail(&ail_updates, &flushed_inodes);
}
- list_del(&tmp);
+
+ xfs_iflush_finish(bp, &flushed_inodes);
+ if (!list_empty(&flushed_inodes))
+ list_splice_tail(&flushed_inodes, &bp->b_li_list);
+}
+
+void
+xfs_buf_inode_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
}
/*
- * This is the inode flushing abort routine. It is called from xfs_iflush when
- * the filesystem is shutting down to clean up the inode state. It is
- * responsible for removing the inode item from the AIL if it has not been
- * re-logged, and unlocking the inode's flush lock.
+ * Clear the inode logging fields so no more flushes are attempted. If we are
+ * on a buffer list, it is now safe to remove it because the buffer is
+ * guaranteed to be locked. The caller will drop the reference to the buffer
+ * the log item held.
+ */
+static void
+xfs_iflush_abort_clean(
+ struct xfs_inode_log_item *iip)
+{
+ iip->ili_last_fields = 0;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ iip->ili_flush_lsn = 0;
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&iip->ili_item.li_bio_list);
+}
+
+/*
+ * Abort flushing the inode from a context holding the cluster buffer locked.
+ *
+ * This is the normal runtime method of aborting writeback of an inode that is
+ * attached to a cluster buffer. It occurs when the inode and the backing
+ * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster
+ * flushing or buffer IO completion encounters a log shutdown situation.
+ *
+ * If we need to abort inode writeback and we don't already hold the buffer
+ * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be
+ * necessary in a shutdown situation.
*/
void
xfs_iflush_abort(
- xfs_inode_t *ip,
- bool stale)
+ struct xfs_inode *ip)
{
- xfs_inode_log_item_t *iip = ip->i_itemp;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp;
- if (iip) {
- if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
- xfs_trans_ail_remove(&iip->ili_item,
- stale ? SHUTDOWN_LOG_IO_ERROR :
- SHUTDOWN_CORRUPT_INCORE);
- }
- iip->ili_logged = 0;
- /*
- * Clear the ili_last_fields bits now that we know that the
- * data corresponding to them is safely on disk.
- */
- iip->ili_last_fields = 0;
- /*
- * Clear the inode logging fields so no more flushes are
- * attempted.
- */
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
}
+
/*
- * Release the inode's flush lock since we're done with it.
+ * Remove the inode item from the AIL before we clear its internal
+ * state. Whilst the inode is in the AIL, it should have a valid buffer
+ * pointer for push operations to access - it is only safe to remove the
+ * inode from the buffer once it has been removed from the AIL.
+ *
+ * We also clear the failed bit before removing the item from the AIL
+ * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
+ * references the inode item owns and needs to hold until we've fully
+ * aborted the inode log item and detached it from the buffer.
*/
- xfs_ifunlock(ip);
+ clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+ xfs_trans_ail_delete(&iip->ili_item, 0);
+
+ /*
+ * Grab the inode buffer so can we release the reference the inode log
+ * item holds on it.
+ */
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ xfs_iflush_abort_clean(iip);
+ spin_unlock(&iip->ili_lock);
+
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ if (bp)
+ xfs_buf_rele(bp);
}
+/*
+ * Abort an inode flush in the case of a shutdown filesystem. This can be called
+ * from anywhere with just an inode reference and does not require holding the
+ * inode cluster buffer locked. If the inode is attached to a cluster buffer,
+ * it will grab and lock it safely, then abort the inode flush.
+ */
void
-xfs_istale_done(
- struct xfs_buf *bp,
- struct xfs_log_item *lip)
+xfs_iflush_shutdown_abort(
+ struct xfs_inode *ip)
{
- xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp;
+
+ if (!iip) {
+ /* clean inode, nothing to do */
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ return;
+ }
+
+ spin_lock(&iip->ili_lock);
+ bp = iip->ili_item.li_buf;
+ if (!bp) {
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ return;
+ }
+
+ /*
+ * We have to take a reference to the buffer so that it doesn't get
+ * freed when we drop the ili_lock and then wait to lock the buffer.
+ * We'll clean up the extra reference after we pick up the ili_lock
+ * again.
+ */
+ xfs_buf_hold(bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_buf_lock(bp);
+
+ spin_lock(&iip->ili_lock);
+ if (!iip->ili_item.li_buf) {
+ /*
+ * Raced with another removal, hold the only reference
+ * to bp now. Inode should not be in the AIL now, so just clean
+ * up and return;
+ */
+ ASSERT(list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags));
+ xfs_iflush_abort_clean(iip);
+ spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_buf_relse(bp);
+ return;
+ }
+
+ /*
+ * Got two references to bp. The first will get dropped by
+ * xfs_iflush_abort() when the item is removed from the buffer list, but
+ * we can't drop our reference until _abort() returns because we have to
+ * unlock the buffer as well. Hence we abort and then unlock and release
+ * our reference to the buffer.
+ */
+ ASSERT(iip->ili_item.li_buf == bp);
+ spin_unlock(&iip->ili_lock);
+ xfs_iflush_abort(ip);
+ xfs_buf_relse(bp);
}
+
/*
* convert an xfs_inode_log_format struct from the old 32 bit version
* (which can have different field alignments) to the native 64 bit version
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 07a60e74c39c..bbd836a44ff0 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -13,31 +13,41 @@ struct xfs_bmbt_rec;
struct xfs_inode;
struct xfs_mount;
-typedef struct xfs_inode_log_item {
+struct xfs_inode_log_item {
struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
- xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
- xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
- unsigned short ili_lock_flags; /* lock flags */
- unsigned short ili_logged; /* flushed logged data */
+ unsigned short ili_lock_flags; /* inode lock flags */
+ /*
+ * The ili_lock protects the interactions between the dirty state and
+ * the flush state of the inode log item. This allows us to do atomic
+ * modifications of multiple state fields without having to hold a
+ * specific inode lock to serialise them.
+ *
+ * We need atomic changes between inode dirtying, inode flushing and
+ * inode completion, but these all hold different combinations of
+ * ILOCK and IFLUSHING and hence we need some other method of
+ * serialising updates to the flush state.
+ */
+ spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
unsigned int ili_fsync_fields; /* logged since last fsync */
-} xfs_inode_log_item_t;
+ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+ xfs_csn_t ili_commit_seq; /* last transaction commit */
+};
-static inline int xfs_inode_clean(xfs_inode_t *ip)
+static inline int xfs_inode_clean(struct xfs_inode *ip)
{
return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
}
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_iflush_abort(struct xfs_inode *, bool);
+extern void xfs_iflush_abort(struct xfs_inode *);
+extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
-extern struct kmem_zone *xfs_ili_zone;
+extern struct kmem_cache *xfs_ili_cache;
#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
new file mode 100644
index 000000000000..0e5dba2343ea
--- /dev/null
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
+ &xfs_inode_buf_ra_ops);
+ } else {
+ struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
+ &xfs_inode_buf_ra_ops);
+ }
+}
+
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return -ENOMEM;
+
+ /* instantiate the inode */
+ ASSERT(dip->di_version >= 3);
+
+ error = xfs_inode_from_disk(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
+static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
+}
+
+/* Convert a log timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_log_dinode_to_disk_ts(
+ struct xfs_log_dinode *from,
+ const xfs_log_timestamp_t its)
+{
+ struct xfs_legacy_timestamp *lts;
+ struct xfs_log_legacy_timestamp *lits;
+ xfs_timestamp_t ts;
+
+ if (xfs_log_dinode_has_bigtime(from))
+ return cpu_to_be64(its);
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lits = (struct xfs_log_legacy_timestamp *)&its;
+ lts->t_sec = cpu_to_be32(lits->t_sec);
+ lts->t_nsec = cpu_to_be32(lits->t_nsec);
+
+ return ts;
+}
+
+static inline bool xfs_log_dinode_has_large_extent_counts(
+ const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
+}
+
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ if (xfs_log_dinode_has_large_extent_counts(from)) {
+ to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+ to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+ to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+ } else {
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ }
+
+}
+
+STATIC void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = 0;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+
+ to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
+ to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
+ to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime = xfs_log_dinode_to_disk_ts(from,
+ from->di_crtime);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(lsn);
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_v3_pad = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ }
+
+ xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+ struct xfs_mount *mp,
+ struct xfs_log_dinode *ldip)
+{
+ xfs_extnum_t nextents;
+ xfs_aextnum_t anextents;
+
+ if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+ if (!xfs_has_large_extent_counts(mp) ||
+ (ldip->di_nrext64_pad != 0)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode large extent count format",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+ ldip->di_ino, xfs_has_large_extent_counts(mp),
+ ldip->di_nrext64_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_big_nextents;
+ anextents = ldip->di_big_anextents;
+ } else {
+ if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode di_v3_pad",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_v3_pad 0x%llx",
+ ldip->di_ino, ldip->di_v3_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_nextents;
+ anextents = ldip->di_anextents;
+ }
+
+ if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+ ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+ anextents, ldip->di_nblocks);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+STATIC int
+xlog_recover_inode_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_inode_log_format *in_f;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ int len;
+ char *src;
+ char *dest;
+ int error;
+ int attr_index;
+ uint fields;
+ struct xfs_log_dinode *ldip;
+ uint isize;
+ int need_free = 0;
+
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ in_f = item->ri_buf[0].i_addr;
+ } else {
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+ need_free = 1;
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+ if (error)
+ goto error;
+ }
+
+ /*
+ * Inode buffers can be freed, look out for it,
+ * and do not replay the inode.
+ */
+ if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) {
+ error = 0;
+ trace_xfs_log_recover_inode_cancel(log, in_f);
+ goto error;
+ }
+ trace_xfs_log_recover_inode_recover(log, in_f);
+
+ error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+ 0, &bp, &xfs_inode_buf_ops);
+ if (error)
+ goto error;
+ ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+ dip = xfs_buf_offset(bp, in_f->ilf_boffset);
+
+ /*
+ * Make sure the place we're flushing out to really looks
+ * like an inode!
+ */
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
+ xfs_alert(mp,
+ "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld",
+ __func__, dip, bp, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ ldip = item->ri_buf[1].i_addr;
+ if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
+ __func__, item, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
+ /*
+ * If the inode has an LSN in it, recover the inode only if the on-disk
+ * inode's LSN is older than the lsn of the transaction we are
+ * replaying. We can have multiple checkpoints with the same start LSN,
+ * so the current LSN being equal to the on-disk LSN doesn't necessarily
+ * mean that the on-disk inode is more recent than the change being
+ * replayed.
+ *
+ * We must check the current_lsn against the on-disk inode
+ * here because the we can't trust the log dinode to contain a valid LSN
+ * (see comment below before replaying the log dinode for details).
+ *
+ * Note: we still need to replay an owner change even though the inode
+ * is more recent than the transaction as there is no guarantee that all
+ * the btree blocks are more recent than this transaction, too.
+ */
+ if (dip->di_version >= 3) {
+ xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_owner_change;
+ }
+ }
+
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+ * more accurately by the LSN field in the V3 inode core. Don't trust
+ * the inode versions we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one
+ */
+ if (!xfs_has_v3inodes(mp) &&
+ ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
+ }
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
+
+ if (unlikely(S_ISREG(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for regular file",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ } else if (unlikely(S_ISDIR(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+ (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for directory",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ }
+
+ error = xlog_dinode_verify_extent_counts(mp, ldip);
+ if (error)
+ goto out_release;
+
+ if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_forkoff 0x%x",
+ in_f->ilf_ino, ldip->di_forkoff);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ isize = xfs_log_dinode_size(mp);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+ mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx log dinode size 0x%x",
+ in_f->ilf_ino, item->ri_buf[1].i_len);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
+ /*
+ * Recover the log dinode inode into the on disk inode.
+ *
+ * The LSN in the log dinode is garbage - it can be zero or reflect
+ * stale in-memory runtime state that isn't coherent with the changes
+ * logged in this transaction or the changes written to the on-disk
+ * inode. Hence we write the current lSN into the inode because that
+ * matches what xfs_iflush() would write inode the inode when flushing
+ * the changes in this transaction.
+ */
+ xfs_log_dinode_to_disk(ldip, dip, current_lsn);
+
+ fields = in_f->ilf_fields;
+ if (fields & XFS_ILOG_DEV)
+ xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
+
+ if (in_f->ilf_size == 2)
+ goto out_owner_change;
+ len = item->ri_buf[2].i_len;
+ src = item->ri_buf[2].i_addr;
+ ASSERT(in_f->ilf_size <= 4);
+ ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+ ASSERT(!(fields & XFS_ILOG_DFORK) ||
+ (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
+
+ switch (fields & XFS_ILOG_DFORK) {
+ case XFS_ILOG_DDATA:
+ case XFS_ILOG_DEXT:
+ memcpy(XFS_DFORK_DPTR(dip), src, len);
+ break;
+
+ case XFS_ILOG_DBROOT:
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+ (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
+ XFS_DFORK_DSIZE(dip, mp));
+ break;
+
+ default:
+ /*
+ * There are no data fork flags set.
+ */
+ ASSERT((fields & XFS_ILOG_DFORK) == 0);
+ break;
+ }
+
+ /*
+ * If we logged any attribute data, recover it. There may or
+ * may not have been any other non-core data logged in this
+ * transaction.
+ */
+ if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+ attr_index = 3;
+ } else {
+ attr_index = 2;
+ }
+ len = item->ri_buf[attr_index].i_len;
+ src = item->ri_buf[attr_index].i_addr;
+ ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
+
+ switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ case XFS_ILOG_ADATA:
+ case XFS_ILOG_AEXT:
+ dest = XFS_DFORK_APTR(dip);
+ ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+ memcpy(dest, src, len);
+ break;
+
+ case XFS_ILOG_ABROOT:
+ dest = XFS_DFORK_APTR(dip);
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+ len, (struct xfs_bmdr_block *)dest,
+ XFS_DFORK_ASIZE(dip, mp));
+ break;
+
+ default:
+ xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ }
+
+out_owner_change:
+ /* Recover the swapext owner change unless inode has been deleted */
+ if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
+ (dip->di_mode != 0))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
+error:
+ if (need_free)
+ kmem_free(in_f);
+ return error;
+}
+
+const struct xlog_recover_item_ops xlog_inode_item_ops = {
+ .item_type = XFS_LI_INODE,
+ .ra_pass2 = xlog_recover_inode_ra_pass2,
+ .commit_pass2 = xlog_recover_inode_commit_pass2,
+};
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d42de92cb283..1f783e979629 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -15,6 +15,8 @@
#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -35,9 +37,11 @@
#include "xfs_health.h"
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
+#include "xfs_xattr.h"
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/fileattr.h>
/*
* xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -292,62 +296,173 @@ xfs_readlink_by_handle(
return error;
}
-STATIC int
-xfs_attrlist_by_handle(
- struct file *parfilp,
- void __user *arg)
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+static void
+xfs_ioc_attr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
{
- int error = -ENOMEM;
- attrlist_cursor_kern_t *cursor;
- struct xfs_fsop_attrlist_handlereq __user *p = arg;
- xfs_fsop_attrlist_handlereq_t al_hreq;
- struct dentry *dentry;
- char *kbuf;
+ struct xfs_attrlist *alist = context->buffer;
+ struct xfs_attrlist_ent *aep;
+ int arraytop;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
- return -EFAULT;
- if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XFS_XATTR_LIST_MAX)
+ ASSERT(!context->seen_enough);
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+
+ /* decrement by the actual bytes used by the attr */
+ context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
+ namelen + 1, sizeof(uint32_t));
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return;
+ }
+
+ aep = context->buffer + context->firstu;
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+}
+
+static unsigned int
+xfs_attr_filter(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_ROOT)
+ return XFS_ATTR_ROOT;
+ if (ioc_flags & XFS_IOC_ATTR_SECURE)
+ return XFS_ATTR_SECURE;
+ return 0;
+}
+
+static unsigned int
+xfs_attr_flags(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_CREATE)
+ return XATTR_CREATE;
+ if (ioc_flags & XFS_IOC_ATTR_REPLACE)
+ return XATTR_REPLACE;
+ return 0;
+}
+
+int
+xfs_ioc_attr_list(
+ struct xfs_inode *dp,
+ void __user *ubuf,
+ size_t bufsize,
+ int flags,
+ struct xfs_attrlist_cursor __user *ucursor)
+{
+ struct xfs_attr_list_context context = { };
+ struct xfs_attrlist *alist;
+ void *buffer;
+ int error;
+
+ if (bufsize < sizeof(struct xfs_attrlist) ||
+ bufsize > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
* Reject flags, only allow namespaces.
*/
- if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+ if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+ if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
return -EINVAL;
- dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ /*
+ * Validate the cursor.
+ */
+ if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
+ return -EFAULT;
+ if (context.cursor.pad1 || context.cursor.pad2)
+ return -EINVAL;
+ if (!context.cursor.initted &&
+ (context.cursor.hashval || context.cursor.blkno ||
+ context.cursor.offset))
+ return -EINVAL;
- kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
- if (!kbuf)
- goto out_dput;
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
- al_hreq.flags, cursor);
+ /*
+ * Initialize the output buffer.
+ */
+ context.dp = dp;
+ context.resynch = 1;
+ context.attr_filter = xfs_attr_filter(flags);
+ context.buffer = buffer;
+ context.bufsize = round_down(bufsize, sizeof(uint32_t));
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_ioc_attr_put_listent;
+
+ alist = context.buffer;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list(&context);
if (error)
- goto out_kfree;
+ goto out_free;
- if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+ if (copy_to_user(ubuf, buffer, bufsize) ||
+ copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
error = -EFAULT;
- goto out_kfree;
- }
+out_free:
+ kmem_free(buffer);
+ return error;
+}
- if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
- error = -EFAULT;
+STATIC int
+xfs_attrlist_by_handle(
+ struct file *parfilp,
+ struct xfs_fsop_attrlist_handlereq __user *p)
+{
+ struct xfs_fsop_attrlist_handlereq al_hreq;
+ struct dentry *dentry;
+ int error = -ENOMEM;
-out_kfree:
- kmem_free(kbuf);
-out_dput:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
+ return -EFAULT;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
+ al_hreq.buflen, al_hreq.flags, &p->pos);
dput(dentry);
return error;
}
-int
+static int
xfs_attrmulti_attr_get(
struct inode *inode,
unsigned char *name,
@@ -355,31 +470,33 @@ xfs_attrmulti_attr_get(
uint32_t *len,
uint32_t flags)
{
- unsigned char *kbuf;
- int error = -EFAULT;
- size_t namelen;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ .valuelen = *len,
+ };
+ int error;
if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
- kbuf = kmem_zalloc_large(*len, 0);
- if (!kbuf)
- return -ENOMEM;
- namelen = strlen(name);
- error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len,
- flags);
+ error = xfs_attr_get(&args);
if (error)
goto out_kfree;
- if (copy_to_user(ubuf, kbuf, *len))
+ *len = args.valuelen;
+ if (copy_to_user(ubuf, args.value, args.valuelen))
error = -EFAULT;
out_kfree:
- kmem_free(kbuf);
+ kmem_free(args.value);
return error;
}
-int
+static int
xfs_attrmulti_attr_set(
struct inode *inode,
unsigned char *name,
@@ -387,42 +504,75 @@ xfs_attrmulti_attr_set(
uint32_t len,
uint32_t flags)
{
- unsigned char *kbuf;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ };
int error;
- size_t namelen;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
- kbuf = memdup_user(ubuf, len);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
+ if (ubuf) {
+ if (len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+ args.value = memdup_user(ubuf, len);
+ if (IS_ERR(args.value))
+ return PTR_ERR(args.value);
+ args.valuelen = len;
+ }
- namelen = strlen(name);
- error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags);
- if (!error)
- xfs_forget_acl(inode, name, flags);
- kfree(kbuf);
+ error = xfs_attr_change(&args);
+ if (!error && (flags & XFS_IOC_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ kfree(args.value);
return error;
}
int
-xfs_attrmulti_attr_remove(
+xfs_ioc_attrmulti_one(
+ struct file *parfilp,
struct inode *inode,
- unsigned char *name,
+ uint32_t opcode,
+ void __user *uname,
+ void __user *value,
+ uint32_t *len,
uint32_t flags)
{
+ unsigned char *name;
int error;
- size_t namelen;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- namelen = strlen(name);
- error = xfs_attr_remove(XFS_I(inode), name, namelen, flags);
- if (!error)
- xfs_forget_acl(inode, name, flags);
+ if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ name = strndup_user(uname, MAXNAMELEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ switch (opcode) {
+ case ATTR_OP_GET:
+ error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
+ break;
+ case ATTR_OP_REMOVE:
+ value = NULL;
+ *len = 0;
+ fallthrough;
+ case ATTR_OP_SET:
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ break;
+ error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
+ mnt_drop_write_file(parfilp);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ kfree(name);
return error;
}
@@ -436,7 +586,6 @@ xfs_attrmulti_by_handle(
xfs_fsop_attrmulti_handlereq_t am_hreq;
struct dentry *dentry;
unsigned int i, size;
- unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -462,148 +611,23 @@ xfs_attrmulti_by_handle(
goto out_dput;
}
- error = -ENOMEM;
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
- if ((ops[i].am_flags & ATTR_ROOT) &&
- (ops[i].am_flags & ATTR_SECURE)) {
- ops[i].am_error = -EINVAL;
- continue;
- }
- ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
-
- ops[i].am_error = strncpy_from_user((char *)attr_name,
- ops[i].am_attrname, MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(
- d_inode(dentry), attr_name,
- ops[i].am_attrvalue, &ops[i].am_length,
- ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_set(
- d_inode(dentry), attr_name,
- ops[i].am_attrvalue, ops[i].am_length,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_remove(
- d_inode(dentry), attr_name,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- ops[i].am_error = -EINVAL;
- }
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ ops[i].am_attrname, ops[i].am_attrvalue,
+ &ops[i].am_length, ops[i].am_flags);
}
if (copy_to_user(am_hreq.ops, ops, size))
error = -EFAULT;
- kfree(attr_name);
- out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
return error;
}
-int
-xfs_ioc_space(
- struct file *filp,
- xfs_flock64_t *bf)
-{
- struct inode *inode = file_inode(filp);
- struct xfs_inode *ip = XFS_I(inode);
- struct iattr iattr;
- enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR;
- uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- int error;
-
- if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
- return -EPERM;
-
- if (!(filp->f_mode & FMODE_WRITE))
- return -EBADF;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
- if (xfs_is_always_cow_inode(ip))
- return -EOPNOTSUPP;
-
- if (filp->f_flags & O_DSYNC)
- flags |= XFS_PREALLOC_SYNC;
- if (filp->f_mode & FMODE_NOCMTIME)
- flags |= XFS_PREALLOC_INVISIBLE;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
- if (error)
- goto out_unlock;
- inode_dio_wait(inode);
-
- switch (bf->l_whence) {
- case 0: /*SEEK_SET*/
- break;
- case 1: /*SEEK_CUR*/
- bf->l_start += filp->f_pos;
- break;
- case 2: /*SEEK_END*/
- bf->l_start += XFS_ISIZE(ip);
- break;
- default:
- error = -EINVAL;
- goto out_unlock;
- }
-
- if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- if (bf->l_start > XFS_ISIZE(ip)) {
- error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
- bf->l_start - XFS_ISIZE(ip), 0);
- if (error)
- goto out_unlock;
- }
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = bf->l_start;
- error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
- if (error)
- goto out_unlock;
-
- error = xfs_update_prealloc_flags(ip, flags);
-
-out_unlock:
- xfs_iunlock(ip, iolock);
- mnt_drop_write_file(filp);
- return error;
-}
-
/* Return 0 on success or positive error */
int
xfs_fsbulkstat_one_fmt(
@@ -633,13 +657,15 @@ xfs_fsinumbers_fmt(
STATIC int
xfs_ioc_fsbulkstat(
- xfs_mount_t *mp,
+ struct file *file,
unsigned int cmd,
void __user *arg)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
struct xfs_fsop_bulkreq bulkreq;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
.ocount = 0,
};
xfs_ino_t lastino;
@@ -651,7 +677,7 @@ xfs_ioc_fsbulkstat(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq)))
@@ -788,6 +814,9 @@ xfs_bulk_ireq_setup(
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
return -ECANCELED;
+ if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+ breq->flags |= XFS_IBULK_NREXT64;
+
return 0;
}
@@ -807,20 +836,22 @@ xfs_bulk_ireq_teardown(
/* Handle the v5 bulkstat ioctl. */
STATIC int
xfs_ioc_bulkstat(
- struct xfs_mount *mp,
+ struct file *file,
unsigned int cmd,
struct xfs_bulkstat_req __user *arg)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
struct xfs_bulk_ireq hdr;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
};
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
@@ -870,7 +901,7 @@ xfs_ioc_inumbers(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
@@ -903,7 +934,7 @@ xfs_ioc_fsgeometry(
struct xfs_fsop_geom fsgeo;
size_t len;
- xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
+ xfs_fs_geometry(mp, &fsgeo, struct_version);
if (struct_version <= 3)
len = sizeof(struct xfs_fsop_geom_v1);
@@ -924,6 +955,7 @@ xfs_ioc_ag_geometry(
struct xfs_mount *mp,
void __user *arg)
{
+ struct xfs_perag *pag;
struct xfs_ag_geometry ageo;
int error;
@@ -934,7 +966,12 @@ xfs_ioc_ag_geometry(
if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
return -EINVAL;
- error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
+ pag = xfs_perag_get(mp, ageo.ag_number);
+ if (!pag)
+ return -EINVAL;
+
+ error = xfs_ag_get_geometry(pag, &ageo);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -947,98 +984,72 @@ xfs_ioc_ag_geometry(
* Linux extended inode flags interface.
*/
-STATIC unsigned int
-xfs_merge_ioc_xflags(
- unsigned int flags,
- unsigned int start)
-{
- unsigned int xflags = start;
-
- if (flags & FS_IMMUTABLE_FL)
- xflags |= FS_XFLAG_IMMUTABLE;
- else
- xflags &= ~FS_XFLAG_IMMUTABLE;
- if (flags & FS_APPEND_FL)
- xflags |= FS_XFLAG_APPEND;
- else
- xflags &= ~FS_XFLAG_APPEND;
- if (flags & FS_SYNC_FL)
- xflags |= FS_XFLAG_SYNC;
- else
- xflags &= ~FS_XFLAG_SYNC;
- if (flags & FS_NOATIME_FL)
- xflags |= FS_XFLAG_NOATIME;
- else
- xflags &= ~FS_XFLAG_NOATIME;
- if (flags & FS_NODUMP_FL)
- xflags |= FS_XFLAG_NODUMP;
- else
- xflags &= ~FS_XFLAG_NODUMP;
-
- return xflags;
-}
-
-STATIC unsigned int
-xfs_di2lxflags(
- uint16_t di_flags)
-{
- unsigned int flags = 0;
-
- if (di_flags & XFS_DIFLAG_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
- if (di_flags & XFS_DIFLAG_APPEND)
- flags |= FS_APPEND_FL;
- if (di_flags & XFS_DIFLAG_SYNC)
- flags |= FS_SYNC_FL;
- if (di_flags & XFS_DIFLAG_NOATIME)
- flags |= FS_NOATIME_FL;
- if (di_flags & XFS_DIFLAG_NODUMP)
- flags |= FS_NODUMP_FL;
- return flags;
-}
-
static void
xfs_fill_fsxattr(
struct xfs_inode *ip,
- bool attr,
- struct fsxattr *fa)
+ int whichfork,
+ struct fileattr *fa)
{
- simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
- fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
- fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
- ip->i_mount->m_sb.sb_blocklog;
- fa->fsx_projid = ip->i_d.di_projid;
-
- if (attr) {
- if (ip->i_afp) {
- if (ip->i_afp->if_flags & XFS_IFEXTENTS)
- fa->fsx_nextents = xfs_iext_count(ip->i_afp);
- else
- fa->fsx_nextents = ip->i_d.di_anextents;
- } else
- fa->fsx_nextents = 0;
- } else {
- if (ip->i_df.if_flags & XFS_IFEXTENTS)
- fa->fsx_nextents = xfs_iext_count(&ip->i_df);
- else
- fa->fsx_nextents = ip->i_d.di_nextents;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+
+ fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
+
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) {
+ fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ } else if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ /*
+ * Don't let a misaligned extent size hint on a directory
+ * escape to userspace if it won't pass the setattr checks
+ * later.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
+ FS_XFLAG_EXTSZINHERIT);
+ fa->fsx_extsize = 0;
+ } else {
+ fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize);
+ }
}
+
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize);
+ fa->fsx_projid = ip->i_projid;
+ if (ifp && !xfs_need_iread_extents(ifp))
+ fa->fsx_nextents = xfs_iext_count(ifp);
+ else
+ fa->fsx_nextents = xfs_ifork_nextents(ifp);
}
STATIC int
-xfs_ioc_fsgetxattr(
+xfs_ioc_fsgetxattra(
xfs_inode_t *ip,
- int attr,
void __user *arg)
{
- struct fsxattr fa;
+ struct fileattr fa;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- xfs_fill_fsxattr(ip, attr, &fa);
+ xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ return copy_fsxattr_to_user(&fa, arg);
+}
+
+int
+xfs_fileattr_get(
+ struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
return 0;
}
@@ -1049,7 +1060,7 @@ xfs_flags2diflags(
{
/* can't set PREALLOC this way, just preserve it */
uint16_t di_flags =
- (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+ (ip->i_diflags & XFS_DIFLAG_PREALLOC);
if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
@@ -1090,7 +1101,9 @@ xfs_flags2diflags2(
unsigned int xflags)
{
uint64_t di_flags2 =
- (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+ (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
+ XFS_DIFLAG2_BIGTIME |
+ XFS_DIFLAG2_NREXT64));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
@@ -1100,139 +1113,69 @@ xfs_flags2diflags2(
return di_flags2;
}
-STATIC void
-xfs_diflags_to_linux(
- struct xfs_inode *ip)
-{
- struct inode *inode = VFS_I(ip);
- unsigned int xflags = xfs_ip2xflags(ip);
-
- if (xflags & FS_XFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (xflags & FS_XFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (xflags & FS_XFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (xflags & FS_XFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
-#if 0 /* disabled until the flag switching races are sorted out */
- if (xflags & FS_XFLAG_DAX)
- inode->i_flags |= S_DAX;
- else
- inode->i_flags &= ~S_DAX;
-#endif
-}
-
static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- uint64_t di_flags2;
+ uint64_t i_flags2;
/* Can't change realtime flag if any extents are allocated. */
- if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+ if ((ip->i_df.if_nextents || ip->i_delayed_blks) &&
XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
return -EINVAL;
/* If realtime flag is set then must have realtime device */
if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
- (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
+ (ip->i_extsize % mp->m_sb.sb_rextsize))
return -EINVAL;
}
/* Clear reflink if we are actually able to set the rt flag. */
if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
/* Don't allow us to set DAX mode for a reflinked file for now. */
if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
return -EINVAL;
/* diflags2 only valid for v3 inodes. */
- di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- if (di_flags2 && ip->i_d.di_version < 3)
+ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
+ if (i_flags2 && !xfs_has_v3inodes(mp))
return -EINVAL;
- ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
- ip->i_d.di_flags2 = di_flags2;
+ ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ ip->i_diflags2 = i_flags2;
- xfs_diflags_to_linux(ip);
+ xfs_diflags_to_iflags(ip, false);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
return 0;
}
-/*
- * If we are changing DAX flags, we have to ensure the file is clean and any
- * cached objects in the address space are invalidated and removed. This
- * requires us to lock out other IO and page faults similar to a truncate
- * operation. The locks need to be held until the transaction has been committed
- * so that the cache invalidation is atomic with respect to the DAX flag
- * manipulation.
- */
-static int
-xfs_ioctl_setattr_dax_invalidate(
+static void
+xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fsxattr *fa,
- int *join_flags)
+ struct fileattr *fa)
{
- struct inode *inode = VFS_I(ip);
- struct super_block *sb = inode->i_sb;
- int error;
-
- *join_flags = 0;
-
- /*
- * It is only valid to set the DAX flag on regular files and
- * directories on filesystems where the block size is equal to the page
- * size. On directories it serves as an inherited hint so we don't
- * have to check the device for dax support or flush pagecache.
- */
- if (fa->fsx_xflags & FS_XFLAG_DAX) {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
-
- if (!bdev_dax_supported(target->bt_bdev, sb->s_blocksize))
- return -EINVAL;
- }
-
- /* If the DAX state is not changing, we have nothing to do here. */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
- return 0;
- if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
- return 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
if (S_ISDIR(inode->i_mode))
- return 0;
+ return;
- /* lock, flush and invalidate mapping in preparation for flag change */
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
- error = filemap_write_and_wait(inode->i_mapping);
- if (error)
- goto out_unlock;
- error = invalidate_inode_pages2(inode->i_mapping);
- if (error)
- goto out_unlock;
-
- *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
- return 0;
-
-out_unlock:
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
- return error;
+ if (xfs_has_dax_always(mp) || xfs_has_dax_never(mp))
+ return;
+ if (((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(ip->i_diflags2 & XFS_DIFLAG2_DAX)) ||
+ (!(fa->fsx_xflags & FS_XFLAG_DAX) &&
+ (ip->i_diflags2 & XFS_DIFLAG2_DAX)))
+ d_mark_dontcache(inode);
}
/*
@@ -1240,190 +1183,150 @@ out_unlock:
* have permission to do so. On success, return a clean transaction and the
* inode locked exclusively ready for further operation specific checks. On
* failure, return an error without modifying or locking the inode.
- *
- * The inode might already be IO locked on call. If this is the case, it is
- * indicated in @join_flags and we take full responsibility for ensuring they
- * are unlocked from now on. Hence if we have an error here, we still have to
- * unlock them. Otherwise, once they are joined to the transaction, they will
- * be unlocked on commit/cancel.
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
struct xfs_inode *ip,
- int join_flags)
+ struct xfs_dquot *pdqp)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error = -EROFS;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- goto out_unlock;
+ if (xfs_is_readonly(mp))
+ goto out_error;
error = -EIO;
- if (XFS_FORCED_SHUTDOWN(mp))
- goto out_unlock;
+ if (xfs_is_shutdown(mp))
+ goto out_error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
- goto out_unlock;
+ goto out_error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
- join_flags = 0;
-
- /*
- * CAP_FOWNER overrides the following restrictions:
- *
- * The user ID of the calling process must be equal to the file owner
- * ID, except in cases where the CAP_FSETID capability is applicable.
- */
- if (!inode_owner_or_capable(VFS_I(ip))) {
- error = -EPERM;
- goto out_cancel;
- }
-
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
return tp;
-out_cancel:
- xfs_trans_cancel(tp);
-out_unlock:
- if (join_flags)
- xfs_iunlock(ip, join_flags);
+out_error:
return ERR_PTR(error);
}
/*
- * extent size hint validation is somewhat cumbersome. Rules are:
- *
- * 1. extent size hint is only valid for directories and regular files
- * 2. FS_XFLAG_EXTSIZE is only valid for regular files
- * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
- * 4. can only be changed on regular files if no extents are allocated
- * 5. can be changed on directories at any time
- * 6. extsize hint of 0 turns off hints, clears inode flags.
- * 7. Extent size must be a multiple of the appropriate block size.
- * 8. for non-realtime files, the extent size hint must be limited
- * to half the AG size to avoid alignment extending the extent beyond the
- * limits of the AG.
- *
- * Please keep this function in sync with xfs_scrub_inode_extsize.
+ * Validate a proposed extent size hint. For regular files, the hint can only
+ * be changed if no extents are allocated.
*/
static int
xfs_ioctl_setattr_check_extsize(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_extlen_t size;
- xfs_fsblock_t extsize_fsb;
+ xfs_failaddr_t failaddr;
+ uint16_t new_diflags;
- if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
- ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
- return -EINVAL;
-
- if (fa->fsx_extsize == 0)
+ if (!fa->fsx_valid)
return 0;
- extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
- if (extsize_fsb > MAXEXTLEN)
+ if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
+ XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize)
return -EINVAL;
- if (XFS_IS_REALTIME_INODE(ip) ||
- (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
- size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
- } else {
- size = mp->m_sb.sb_blocksize;
- if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+ if (fa->fsx_extsize & mp->m_blockmask)
+ return -EINVAL;
+
+ new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+
+ /*
+ * Inode verifiers do not check that the extent size hint is an integer
+ * multiple of the rt extent size on a directory with both rtinherit
+ * and extszinherit flags set. Don't let sysadmins misconfigure
+ * directories.
+ */
+ if ((new_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) {
+ unsigned int rtextsize_bytes;
+
+ rtextsize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ if (fa->fsx_extsize % rtextsize_bytes)
return -EINVAL;
}
- if (fa->fsx_extsize % size)
- return -EINVAL;
-
- return 0;
+ failaddr = xfs_inode_validate_extsize(ip->i_mount,
+ XFS_B_TO_FSB(mp, fa->fsx_extsize),
+ VFS_I(ip)->i_mode, new_diflags);
+ return failaddr != NULL ? -EINVAL : 0;
}
-/*
- * CoW extent size hint validation rules are:
- *
- * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
- * The inode does not have to have any shared blocks, but it must be a v3.
- * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
- * for a directory, the hint is propagated to new files.
- * 3. Can be changed on files & directories at any time.
- * 4. CoW extsize hint of 0 turns off hints, clears inode flags.
- * 5. Extent size must be a multiple of the appropriate block size.
- * 6. The extent size hint must be limited to half the AG size to avoid
- * alignment extending the extent beyond the limits of the AG.
- *
- * Please keep this function in sync with xfs_scrub_inode_cowextsize.
- */
static int
xfs_ioctl_setattr_check_cowextsize(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_extlen_t size;
- xfs_fsblock_t cowextsize_fsb;
-
- if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
- return 0;
+ xfs_failaddr_t failaddr;
+ uint64_t new_diflags2;
+ uint16_t new_diflags;
- if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) ||
- ip->i_d.di_version != 3)
- return -EINVAL;
-
- if (fa->fsx_cowextsize == 0)
+ if (!fa->fsx_valid)
return 0;
- cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
- if (cowextsize_fsb > MAXEXTLEN)
- return -EINVAL;
-
- size = mp->m_sb.sb_blocksize;
- if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+ if (fa->fsx_cowextsize & mp->m_blockmask)
return -EINVAL;
- if (fa->fsx_cowextsize % size)
- return -EINVAL;
+ new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- return 0;
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount,
+ XFS_B_TO_FSB(mp, fa->fsx_cowextsize),
+ VFS_I(ip)->i_mode, new_diflags, new_diflags2);
+ return failaddr != NULL ? -EINVAL : 0;
}
static int
xfs_ioctl_setattr_check_projid(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
- /* Disallow 32bit project ids if projid32bit feature is not enabled. */
+ if (!fa->fsx_valid)
+ return 0;
+
+ /* Disallow 32bit project ids if 32bit IDs are not enabled. */
if (fa->fsx_projid > (uint16_t)-1 &&
- !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+ !xfs_has_projid32(ip->i_mount))
return -EINVAL;
return 0;
}
-STATIC int
-xfs_ioctl_setattr(
- xfs_inode_t *ip,
- struct fsxattr *fa)
+int
+xfs_fileattr_set(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct fileattr *fa)
{
- struct fsxattr old_fa;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_dquot *udqp = NULL;
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
- int code;
- int join_flags = 0;
+ int error;
trace_xfs_ioctl_setattr(ip);
- code = xfs_ioctl_setattr_check_projid(ip, fa);
- if (code)
- return code;
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ if (!fa->fsx_valid) {
+ if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
+ FS_NOATIME_FL | FS_NODUMP_FL |
+ FS_SYNC_FL | FS_DAX_FL | FS_PROJINHERIT_FL))
+ return -EOPNOTSUPP;
+ }
+
+ error = xfs_ioctl_setattr_check_projid(ip, fa);
+ if (error)
+ return error;
/*
* If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1433,56 +1336,36 @@ xfs_ioctl_setattr(
* If the IDs do change before we take the ilock, we're covered
* because the i_*dquot fields will get updated anyway.
*/
- if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
- ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
- if (code)
- return code;
+ if (fa->fsx_valid && XFS_IS_QUOTA_ON(mp)) {
+ error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ VFS_I(ip)->i_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
+ if (error)
+ return error;
}
- /*
- * Changing DAX config may require inode locking for mapping
- * invalidation. These need to be held all the way to transaction commit
- * or cancel time, so need to be passed through to
- * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
- * appropriately.
- */
- code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
- if (code)
- goto error_free_dquots;
+ xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
+ tp = xfs_ioctl_setattr_get_trans(ip, pdqp);
if (IS_ERR(tp)) {
- code = PTR_ERR(tp);
+ error = PTR_ERR(tp);
goto error_free_dquots;
}
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- ip->i_d.di_projid != fa->fsx_projid) {
- code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
- capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
- if (code) /* out of quota */
- goto error_trans_cancel;
- }
-
- xfs_fill_fsxattr(ip, false, &old_fa);
- code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
- if (code)
- goto error_trans_cancel;
-
- code = xfs_ioctl_setattr_check_extsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_extsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_xflags(tp, ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_xflags(tp, ip, fa);
+ if (error)
goto error_trans_cancel;
+ if (!fa->fsx_valid)
+ goto skip_xattr;
/*
* Change file ownership. Must be the owner or privileged. CAP_FSETID
* overrides the following restrictions:
@@ -1492,17 +1375,16 @@ xfs_ioctl_setattr(
*/
if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
- !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+ !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID))
VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
- if (ip->i_d.di_projid != fa->fsx_projid) {
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+ if (ip->i_projid != fa->fsx_projid) {
+ if (XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
- ASSERT(ip->i_d.di_version > 1);
- ip->i_d.di_projid = fa->fsx_projid;
+ ip->i_projid = fa->fsx_projid;
}
/*
@@ -1510,129 +1392,33 @@ xfs_ioctl_setattr(
* extent size hint should be set on the inode. If no extent size flags
* are set on the inode then unconditionally clear the extent size hint.
*/
- if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
- ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+ if (ip->i_diflags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+ ip->i_extsize = XFS_B_TO_FSB(mp, fa->fsx_extsize);
else
- ip->i_d.di_extsize = 0;
- if (ip->i_d.di_version == 3 &&
- (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
- ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
- mp->m_sb.sb_blocklog;
- else
- ip->i_d.di_cowextsize = 0;
+ ip->i_extsize = 0;
- code = xfs_trans_commit(tp);
+ if (xfs_has_v3inodes(mp)) {
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+ else
+ ip->i_cowextsize = 0;
+ }
+
+skip_xattr:
+ error = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
*/
xfs_qm_dqrele(olddquot);
- xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
error_trans_cancel:
xfs_trans_cancel(tp);
error_free_dquots:
- xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
- return code;
-}
-
-STATIC int
-xfs_ioc_fssetxattr(
- xfs_inode_t *ip,
- struct file *filp,
- void __user *arg)
-{
- struct fsxattr fa;
- int error;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
- error = xfs_ioctl_setattr(ip, &fa);
- mnt_drop_write_file(filp);
- return error;
-}
-
-STATIC int
-xfs_ioc_getxflags(
- xfs_inode_t *ip,
- void __user *arg)
-{
- unsigned int flags;
-
- flags = xfs_di2lxflags(ip->i_d.di_flags);
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -EFAULT;
- return 0;
-}
-
-STATIC int
-xfs_ioc_setxflags(
- struct xfs_inode *ip,
- struct file *filp,
- void __user *arg)
-{
- struct xfs_trans *tp;
- struct fsxattr fa;
- struct fsxattr old_fa;
- unsigned int flags;
- int join_flags = 0;
- int error;
-
- if (copy_from_user(&flags, arg, sizeof(flags)))
- return -EFAULT;
-
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
- FS_NOATIME_FL | FS_NODUMP_FL | \
- FS_SYNC_FL))
- return -EOPNOTSUPP;
-
- fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- /*
- * Changing DAX config may require inode locking for mapping
- * invalidation. These need to be held all the way to transaction commit
- * or cancel time, so need to be passed through to
- * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
- * appropriately.
- */
- error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
- if (error)
- goto out_drop_write;
-
- tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
- if (IS_ERR(tp)) {
- error = PTR_ERR(tp);
- goto out_drop_write;
- }
-
- xfs_fill_fsxattr(ip, false, &old_fa);
- error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa);
- if (error) {
- xfs_trans_cancel(tp);
- goto out_drop_write;
- }
-
- error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
- if (error) {
- xfs_trans_cancel(tp);
- goto out_drop_write;
- }
-
- error = xfs_trans_commit(tp);
-out_drop_write:
- mnt_drop_write_file(filp);
return error;
}
@@ -1672,10 +1458,8 @@ xfs_ioc_getbmap(
switch (cmd) {
case XFS_IOC_GETBMAPA:
bmx.bmv_iflags = BMV_IF_ATTRFORK;
- /*FALLTHRU*/
+ fallthrough;
case XFS_IOC_GETBMAP:
- if (file->f_mode & FMODE_NOCMTIME)
- bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
/* struct getbmap is a strict subset of struct getbmapx. */
recsize = sizeof(struct getbmap);
break;
@@ -1691,10 +1475,10 @@ xfs_ioc_getbmap(
if (bmx.bmv_count < 2)
return -EINVAL;
- if (bmx.bmv_count > ULONG_MAX / recsize)
+ if (bmx.bmv_count >= INT_MAX / recsize)
return -ENOMEM;
- buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0);
+ buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -1719,39 +1503,17 @@ out_free_buf:
return error;
}
-struct getfsmap_info {
- struct xfs_mount *mp;
- struct fsmap_head __user *data;
- unsigned int idx;
- __u32 last_flags;
-};
-
-STATIC int
-xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
-{
- struct getfsmap_info *info = priv;
- struct fsmap fm;
-
- trace_xfs_getfsmap_mapping(info->mp, xfm);
-
- info->last_flags = xfm->fmr_flags;
- xfs_fsmap_from_internal(&fm, xfm);
- if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
- sizeof(struct fsmap)))
- return -EFAULT;
-
- return 0;
-}
-
STATIC int
xfs_ioc_getfsmap(
struct xfs_inode *ip,
struct fsmap_head __user *arg)
{
- struct getfsmap_info info = { NULL };
struct xfs_fsmap_head xhead = {0};
struct fsmap_head head;
- bool aborted = false;
+ struct fsmap *recs;
+ unsigned int count;
+ __u32 last_flags = 0;
+ bool done = false;
int error;
if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
@@ -1763,43 +1525,117 @@ xfs_ioc_getfsmap(
sizeof(head.fmh_keys[1].fmr_reserved)))
return -EINVAL;
+ /*
+ * Use an internal memory buffer so that we don't have to copy fsmap
+ * data to userspace while holding locks. Start by trying to allocate
+ * up to 128k for the buffer, but fall back to a single page if needed.
+ */
+ count = min_t(unsigned int, head.fmh_count,
+ 131072 / sizeof(struct fsmap));
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs) {
+ count = min_t(unsigned int, head.fmh_count,
+ PAGE_SIZE / sizeof(struct fsmap));
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs)
+ return -ENOMEM;
+ }
+
xhead.fmh_iflags = head.fmh_iflags;
- xhead.fmh_count = head.fmh_count;
xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
- info.mp = ip->i_mount;
- info.data = arg;
- error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
- if (error == -ECANCELED) {
- error = 0;
- aborted = true;
- } else if (error)
- return error;
+ head.fmh_entries = 0;
+ do {
+ struct fsmap __user *user_recs;
+ struct fsmap *last_rec;
+
+ user_recs = &arg->fmh_recs[head.fmh_entries];
+ xhead.fmh_entries = 0;
+ xhead.fmh_count = min_t(unsigned int, count,
+ head.fmh_count - head.fmh_entries);
+
+ /* Run query, record how many entries we got. */
+ error = xfs_getfsmap(ip->i_mount, &xhead, recs);
+ switch (error) {
+ case 0:
+ /*
+ * There are no more records in the result set. Copy
+ * whatever we got to userspace and break out.
+ */
+ done = true;
+ break;
+ case -ECANCELED:
+ /*
+ * The internal memory buffer is full. Copy whatever
+ * records we got to userspace and go again if we have
+ * not yet filled the userspace buffer.
+ */
+ error = 0;
+ break;
+ default:
+ goto out_free;
+ }
+ head.fmh_entries += xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
- /* If we didn't abort, set the "last" flag in the last fmx */
- if (!aborted && info.idx) {
- info.last_flags |= FMR_OF_LAST;
- if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
- &info.last_flags, sizeof(info.last_flags)))
- return -EFAULT;
+ /*
+ * If the caller wanted a record count or there aren't any
+ * new records to return, we're done.
+ */
+ if (head.fmh_count == 0 || xhead.fmh_entries == 0)
+ break;
+
+ /* Copy all the records we got out to userspace. */
+ if (copy_to_user(user_recs, recs,
+ xhead.fmh_entries * sizeof(struct fsmap))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+ /* Remember the last record flags we copied to userspace. */
+ last_rec = &recs[xhead.fmh_entries - 1];
+ last_flags = last_rec->fmr_flags;
+
+ /* Set up the low key for the next iteration. */
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ } while (!done && head.fmh_entries < head.fmh_count);
+
+ /*
+ * If there are no more records in the query result set and we're not
+ * in counting mode, mark the last record returned with the LAST flag.
+ */
+ if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
+ struct fsmap __user *user_rec;
+
+ last_flags |= FMR_OF_LAST;
+ user_rec = &arg->fmh_recs[head.fmh_entries - 1];
+
+ if (copy_to_user(&user_rec->fmr_flags, &last_flags,
+ sizeof(last_flags))) {
+ error = -EFAULT;
+ goto out_free;
+ }
}
/* copy back header */
- head.fmh_entries = xhead.fmh_entries;
- head.fmh_oflags = xhead.fmh_oflags;
- if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
- return -EFAULT;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
- return 0;
+out_free:
+ kmem_free(recs);
+ return error;
}
STATIC int
xfs_ioc_scrub_metadata(
- struct xfs_inode *ip,
+ struct file *file,
void __user *arg)
{
struct xfs_scrub_metadata scrub;
@@ -1811,7 +1647,7 @@ xfs_ioc_scrub_metadata(
if (copy_from_user(&scrub, arg, sizeof(scrub)))
return -EFAULT;
- error = xfs_scrub_metadata(ip, &scrub);
+ error = xfs_scrub_metadata(file, &scrub);
if (error)
return error;
@@ -1886,7 +1722,7 @@ xfs_ioc_swapext(
goto out_put_tmp_file;
}
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ if (xfs_is_shutdown(ip->i_mount)) {
error = -EIO;
goto out_put_tmp_file;
}
@@ -1983,6 +1819,61 @@ out:
return error;
}
+static inline int
+xfs_fs_eofblocks_from_user(
+ struct xfs_fs_eofblocks *src,
+ struct xfs_icwalk *dst)
+{
+ if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+ return -EINVAL;
+
+ if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return -EINVAL;
+
+ if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+ memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+ return -EINVAL;
+
+ dst->icw_flags = 0;
+ if (src->eof_flags & XFS_EOF_FLAGS_SYNC)
+ dst->icw_flags |= XFS_ICWALK_FLAG_SYNC;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_PRID)
+ dst->icw_flags |= XFS_ICWALK_FLAG_PRID;
+ if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE)
+ dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE;
+
+ dst->icw_prid = src->eof_prid;
+ dst->icw_min_file_size = src->eof_min_file_size;
+
+ dst->icw_uid = INVALID_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+ dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid);
+ if (!uid_valid(dst->icw_uid))
+ return -EINVAL;
+ }
+
+ dst->icw_gid = INVALID_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+ dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid);
+ if (!gid_valid(dst->icw_gid))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * These long-unused ioctls were removed from the official ioctl API in 5.17,
+ * but retain these definitions so that we can log warnings about them.
+ */
+#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
+
/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
@@ -2013,13 +1904,11 @@ xfs_file_ioctl(
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64: {
- xfs_flock64_t bf;
-
- if (copy_from_user(&bf, arg, sizeof(bf)))
- return -EFAULT;
- return xfs_ioc_space(filp, &bf);
- }
+ case XFS_IOC_FREESP64:
+ xfs_warn_once(mp,
+ "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported",
+ current->comm);
+ return -ENOTTY;
case XFS_IOC_DIOINFO: {
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
struct dioattr da;
@@ -2035,10 +1924,10 @@ xfs_file_ioctl(
case XFS_IOC_FSBULKSTAT_SINGLE:
case XFS_IOC_FSBULKSTAT:
case XFS_IOC_FSINUMBERS:
- return xfs_ioc_fsbulkstat(mp, cmd, arg);
+ return xfs_ioc_fsbulkstat(filp, cmd, arg);
case XFS_IOC_BULKSTAT:
- return xfs_ioc_bulkstat(mp, cmd, arg);
+ return xfs_ioc_bulkstat(filp, cmd, arg);
case XFS_IOC_INUMBERS:
return xfs_ioc_inumbers(mp, cmd, arg);
@@ -2055,16 +1944,8 @@ xfs_file_ioctl(
case XFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
- case XFS_IOC_FSGETXATTR:
- return xfs_ioc_fsgetxattr(ip, 0, arg);
case XFS_IOC_FSGETXATTRA:
- return xfs_ioc_fsgetxattr(ip, 1, arg);
- case XFS_IOC_FSSETXATTR:
- return xfs_ioc_fssetxattr(ip, filp, arg);
- case XFS_IOC_GETXFLAGS:
- return xfs_ioc_getxflags(ip, arg);
- case XFS_IOC_SETXFLAGS:
- return xfs_ioc_setxflags(ip, filp, arg);
+ return xfs_ioc_fsgetxattra(ip, arg);
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
@@ -2075,7 +1956,7 @@ xfs_file_ioctl(
return xfs_ioc_getfsmap(ip, arg);
case XFS_IOC_SCRUB_METADATA:
- return xfs_ioc_scrub_metadata(ip, arg);
+ return xfs_ioc_scrub_metadata(filp, arg);
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
@@ -2137,7 +2018,7 @@ xfs_file_ioctl(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return -EROFS;
if (copy_from_user(&inout, arg, sizeof(inout)))
@@ -2176,7 +2057,7 @@ xfs_file_ioctl(
}
case XFS_IOC_FSGROWFSDATA: {
- xfs_growfs_data_t in;
+ struct xfs_growfs_data in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
@@ -2190,7 +2071,7 @@ xfs_file_ioctl(
}
case XFS_IOC_FSGROWFSLOG: {
- xfs_growfs_log_t in;
+ struct xfs_growfs_log in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
@@ -2248,23 +2129,28 @@ xfs_file_ioctl(
return xfs_errortag_clearall(mp);
case XFS_IOC_FREE_EOFBLOCKS: {
- struct xfs_fs_eofblocks eofb;
- struct xfs_eofblocks keofb;
+ struct xfs_fs_eofblocks eofb;
+ struct xfs_icwalk icw;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return -EROFS;
if (copy_from_user(&eofb, arg, sizeof(eofb)))
return -EFAULT;
- error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
+ error = xfs_fs_eofblocks_from_user(&eofb, &icw);
if (error)
return error;
- return xfs_icache_free_eofblocks(mp, &keofb);
+ trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
+
+ sb_start_write(mp->m_super);
+ error = xfs_blockgc_free_space(mp, &icw);
+ sb_end_write(mp->m_super);
+ return error;
}
default:
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 420bd95dc326..d4abba2c13c1 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -6,10 +6,9 @@
#ifndef __XFS_IOCTL_H__
#define __XFS_IOCTL_H__
-extern int
-xfs_ioc_space(
- struct file *filp,
- xfs_flock64_t *bf);
+struct xfs_bstat;
+struct xfs_ibulk;
+struct xfs_inogrp;
int
xfs_ioc_swapext(
@@ -30,27 +29,12 @@ xfs_readlink_by_handle(
struct file *parfilp,
xfs_fsop_handlereq_t *hreq);
-extern int
-xfs_attrmulti_attr_get(
- struct inode *inode,
- unsigned char *name,
- unsigned char __user *ubuf,
- uint32_t *len,
- uint32_t flags);
-
-extern int
-xfs_attrmulti_attr_set(
- struct inode *inode,
- unsigned char *name,
- const unsigned char __user *ubuf,
- uint32_t len,
- uint32_t flags);
-
-extern int
-xfs_attrmulti_attr_remove(
- struct inode *inode,
- unsigned char *name,
- uint32_t flags);
+int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
+ uint32_t opcode, void __user *uname, void __user *value,
+ uint32_t *len, uint32_t flags);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+ size_t bufsize, int flags,
+ struct xfs_attrlist_cursor __user *ucursor);
extern struct dentry *
xfs_handle_to_dentry(
@@ -58,6 +42,17 @@ xfs_handle_to_dentry(
void __user *uhandle,
u32 hlen);
+extern int
+xfs_fileattr_get(
+ struct dentry *dentry,
+ struct fileattr *fa);
+
+extern int
+xfs_fileattr_set(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct fileattr *fa);
+
extern long
xfs_file_ioctl(
struct file *filp,
@@ -70,10 +65,6 @@ xfs_file_compat_ioctl(
unsigned int cmd,
unsigned long arg);
-struct xfs_ibulk;
-struct xfs_bstat;
-struct xfs_inogrp;
-
int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
const struct xfs_bulkstat *bstat);
int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 769581a79c58..2f54b701eead 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -17,6 +17,8 @@
#include "xfs_itable.h"
#include "xfs_fsops.h"
#include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
#include "xfs_ioctl32.h"
@@ -28,29 +30,13 @@
#ifdef BROKEN_X86_ALIGNMENT
STATIC int
-xfs_compat_flock64_copyin(
- xfs_flock64_t *bf,
- compat_xfs_flock64_t __user *arg32)
-{
- if (get_user(bf->l_type, &arg32->l_type) ||
- get_user(bf->l_whence, &arg32->l_whence) ||
- get_user(bf->l_start, &arg32->l_start) ||
- get_user(bf->l_len, &arg32->l_len) ||
- get_user(bf->l_sysid, &arg32->l_sysid) ||
- get_user(bf->l_pid, &arg32->l_pid) ||
- copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
- return -EFAULT;
- return 0;
-}
-
-STATIC int
xfs_compat_ioc_fsgeometry_v1(
struct xfs_mount *mp,
compat_xfs_fsop_geom_v1_t __user *arg32)
{
struct xfs_fsop_geom fsgeo;
- xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
+ xfs_fs_geometry(mp, &fsgeo, 3);
/* The 32-bit variant simply has some padding at the end */
if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
return -EFAULT;
@@ -209,14 +195,16 @@ xfs_fsbulkstat_one_fmt_compat(
/* copied from xfs_ioctl.c */
STATIC int
xfs_compat_ioc_fsbulkstat(
- xfs_mount_t *mp,
+ struct file *file,
unsigned int cmd,
struct compat_xfs_fsop_bulkreq __user *p32)
{
+ struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
u32 addr;
struct xfs_fsop_bulkreq bulkreq;
struct xfs_ibulk breq = {
.mp = mp,
+ .mnt_userns = file_mnt_user_ns(file),
.ocount = 0,
};
xfs_ino_t lastino;
@@ -231,7 +219,7 @@ xfs_compat_ioc_fsbulkstat(
inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat;
bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat;
-#ifdef CONFIG_X86_X32
+#ifdef CONFIG_X86_X32_ABI
if (in_x32_syscall()) {
/*
* ... but on x32 the input xfs_fsop_bulkreq has pointers
@@ -252,7 +240,7 @@ xfs_compat_ioc_fsbulkstat(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
if (get_user(addr, &p32->lastip))
@@ -352,56 +340,24 @@ xfs_compat_handlereq_to_dentry(
STATIC int
xfs_compat_attrlist_by_handle(
struct file *parfilp,
- void __user *arg)
+ compat_xfs_fsop_attrlist_handlereq_t __user *p)
{
- int error;
- attrlist_cursor_kern_t *cursor;
- compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
compat_xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
- char *kbuf;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&al_hreq, arg,
- sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
return -EFAULT;
- if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XFS_XATTR_LIST_MAX)
- return -EINVAL;
-
- /*
- * Reject flags, only allow namespaces.
- */
- if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
- return -EINVAL;
dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- error = -ENOMEM;
- kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
- if (!kbuf)
- goto out_dput;
-
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
- al_hreq.flags, cursor);
- if (error)
- goto out_kfree;
-
- if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
- error = -EFAULT;
- goto out_kfree;
- }
-
- if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
- error = -EFAULT;
-
-out_kfree:
- kmem_free(kbuf);
-out_dput:
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)),
+ compat_ptr(al_hreq.buffer), al_hreq.buflen,
+ al_hreq.flags, &p->pos);
dput(dentry);
return error;
}
@@ -416,7 +372,6 @@ xfs_compat_attrmulti_by_handle(
compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
struct dentry *dentry;
unsigned int i, size;
- unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -443,64 +398,18 @@ xfs_compat_attrmulti_by_handle(
goto out_dput;
}
- error = -ENOMEM;
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
- if ((ops[i].am_flags & ATTR_ROOT) &&
- (ops[i].am_flags & ATTR_SECURE)) {
- ops[i].am_error = -EINVAL;
- continue;
- }
- ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
-
- ops[i].am_error = strncpy_from_user((char *)attr_name,
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
compat_ptr(ops[i].am_attrname),
- MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(
- d_inode(dentry), attr_name,
- compat_ptr(ops[i].am_attrvalue),
- &ops[i].am_length, ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_set(
- d_inode(dentry), attr_name,
- compat_ptr(ops[i].am_attrvalue),
- ops[i].am_length, ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_remove(
- d_inode(dentry), attr_name,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- ops[i].am_error = -EINVAL;
- }
+ compat_ptr(ops[i].am_attrvalue),
+ &ops[i].am_length, ops[i].am_flags);
}
if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
error = -EFAULT;
- kfree(attr_name);
- out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
@@ -515,7 +424,6 @@ xfs_file_compat_ioctl(
{
struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
void __user *arg = compat_ptr(p);
int error;
@@ -523,19 +431,8 @@ xfs_file_compat_ioctl(
switch (cmd) {
#if defined(BROKEN_X86_ALIGNMENT)
- case XFS_IOC_ALLOCSP_32:
- case XFS_IOC_FREESP_32:
- case XFS_IOC_ALLOCSP64_32:
- case XFS_IOC_FREESP64_32: {
- struct xfs_flock64 bf;
-
- if (xfs_compat_flock64_copyin(&bf, arg))
- return -EFAULT;
- cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(filp, &bf);
- }
case XFS_IOC_FSGEOMETRY_V1_32:
- return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+ return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg);
case XFS_IOC_FSGROWFSDATA_32: {
struct xfs_growfs_data in;
@@ -544,7 +441,7 @@ xfs_file_compat_ioctl(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_growfs_data(mp, &in);
+ error = xfs_growfs_data(ip->i_mount, &in);
mnt_drop_write_file(filp);
return error;
}
@@ -556,14 +453,12 @@ xfs_file_compat_ioctl(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_growfs_rt(mp, &in);
+ error = xfs_growfs_rt(ip->i_mount, &in);
mnt_drop_write_file(filp);
return error;
}
#endif
/* long changes size, but xfs only copiese out 32 bits */
- case XFS_IOC_GETXFLAGS_32:
- case XFS_IOC_SETXFLAGS_32:
case XFS_IOC_GETVERSION_32:
cmd = _NATIVE_IOC(cmd, long);
return xfs_file_ioctl(filp, cmd, p);
@@ -586,7 +481,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_FSBULKSTAT_32:
case XFS_IOC_FSBULKSTAT_SINGLE_32:
case XFS_IOC_FSINUMBERS_32:
- return xfs_compat_ioc_fsbulkstat(mp, cmd, arg);
+ return xfs_compat_ioc_fsbulkstat(filp, cmd, arg);
case XFS_IOC_FD_TO_HANDLE_32:
case XFS_IOC_PATH_TO_HANDLE_32:
case XFS_IOC_PATH_TO_FSHANDLE_32: {
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 053de7d894cd..c14852362fce 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -17,8 +17,6 @@
*/
/* stock kernel-level ioctls we support */
-#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
-#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
/*
@@ -144,28 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq {
_IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
#ifdef BROKEN_X86_ALIGNMENT
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct compat_xfs_flock64 {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start __attribute__((packed));
- /* len == 0 means until end of file */
- __s64 l_len __attribute__((packed));
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-} compat_xfs_flock64_t;
-
-#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
-#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
-#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
-
typedef struct compat_xfs_fsop_geom_v1 {
__u32 blocksize; /* filesystem (data) block size */
__u32 rtextsize; /* realtime extent size */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index bb590a267a7f..07da03976ec1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -28,7 +28,6 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
-
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -54,7 +53,8 @@ xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
- u16 flags)
+ unsigned int mapping_flags,
+ u16 iomap_flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
@@ -71,16 +71,22 @@ xfs_bmbt_to_iomap(
iomap->type = IOMAP_DELALLOC;
} else {
iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
+ if (mapping_flags & IOMAP_DAX)
+ iomap->addr += target->bt_dax_part_off;
+
if (imap->br_state == XFS_EXT_UNWRITTEN)
iomap->type = IOMAP_UNWRITTEN;
else
iomap->type = IOMAP_MAPPED;
+
}
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = target->bt_bdev;
- iomap->dax_dev = target->bt_daxdev;
- iomap->flags = flags;
+ if (mapping_flags & IOMAP_DAX)
+ iomap->dax_dev = target->bt_daxdev;
+ else
+ iomap->bdev = target->bt_bdev;
+ iomap->flags = iomap_flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
@@ -132,7 +138,7 @@ xfs_eof_alignment(
* If mounted with the "-o swalloc" option the alignment is
* increased from the strip unit size to the stripe width.
*/
- if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ if (mp->m_swidth && xfs_has_swalloc(mp))
align = mp->m_swidth;
else if (mp->m_dalign)
align = mp->m_dalign;
@@ -153,13 +159,13 @@ xfs_iomap_eof_align_last_fsb(
struct xfs_inode *ip,
xfs_fileoff_t end_fsb)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
xfs_extlen_t align = xfs_eof_alignment(ip);
struct xfs_bmbt_irec irec;
struct xfs_iext_cursor icur;
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(!xfs_need_iread_extents(ifp));
/*
* Always round up the allocation request to the extent hint boundary.
@@ -188,31 +194,29 @@ xfs_iomap_write_direct(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb,
+ unsigned int flags,
struct xfs_bmbt_irec *imap)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
xfs_filblks_t resaligned;
int nimaps;
- int quota_flag;
- uint qblocks, resblks;
- unsigned int resrtextents = 0;
+ unsigned int dblocks, rblocks;
+ bool force = false;
int error;
int bmapi_flags = XFS_BMAPI_PREALLOC;
- uint tflags = 0;
+ int nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT;
ASSERT(count_fsb > 0);
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
xfs_get_extsz_hint(ip));
if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
- resrtextents = qblocks = resaligned;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
} else {
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
}
error = xfs_qm_dqattach(ip);
@@ -232,26 +236,26 @@ xfs_iomap_write_direct(
* the reserve block pool for bmbt block allocation if there is no space
* left but we need to do unwritten extent conversion.
*/
- if (IS_DAX(VFS_I(ip))) {
+ if (flags & IOMAP_DAX) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (imap->br_state == XFS_EXT_UNWRITTEN) {
- tflags |= XFS_TRANS_RESERVE;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ force = true;
+ nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
- tflags, &tp);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, force, &tp);
if (error)
return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, nr_exts);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
@@ -260,7 +264,7 @@ xfs_iomap_write_direct(
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
imap, &nimaps);
if (error)
- goto out_res_cancel;
+ goto out_trans_cancel;
/*
* Complete the transaction
@@ -284,8 +288,6 @@ out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
-out_res_cancel:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
xfs_trans_cancel(tp);
goto out_unlock;
@@ -293,11 +295,11 @@ out_trans_cancel:
STATIC bool
xfs_quota_need_throttle(
- struct xfs_inode *ip,
- int type,
- xfs_fsblock_t alloc_blocks)
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_fsblock_t alloc_blocks)
{
- struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
if (!dq || !xfs_this_quota_on(ip->i_mount, type))
return false;
@@ -307,7 +309,7 @@ xfs_quota_need_throttle(
return false;
/* under the lo watermark, no throttle */
- if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+ if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
return false;
return true;
@@ -315,24 +317,24 @@ xfs_quota_need_throttle(
STATIC void
xfs_quota_calc_throttle(
- struct xfs_inode *ip,
- int type,
- xfs_fsblock_t *qblocks,
- int *qshift,
- int64_t *qfreesp)
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_fsblock_t *qblocks,
+ int *qshift,
+ int64_t *qfreesp)
{
- int64_t freesp;
- int shift = 0;
- struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ int64_t freesp;
+ int shift = 0;
/* no dq, or over hi wmark, squash the prealloc completely */
- if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+ if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
*qblocks = 0;
*qfreesp = 0;
return;
}
- freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+ freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
shift = 2;
if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
@@ -352,22 +354,10 @@ xfs_quota_calc_throttle(
}
/*
- * If we are doing a write at the end of the file and there are no allocations
- * past this one, then extend the allocation out to the file system's write
- * iosize.
- *
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
- * filesystem is to full, the smaller the maximum prealocation.
- *
- * As an exception we don't do any preallocation at all if the file is smaller
- * than the minimum preallocation and we are using the default dynamic
- * preallocation scheme, as it is likely this is the only write to the file that
- * is going to be done.
- *
- * We clean up any extra space left over when the file is closed in
- * xfs_inactive().
+ * filesystem is to being full, the smaller the maximum preallocation.
*/
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
@@ -377,65 +367,72 @@ xfs_iomap_prealloc_size(
loff_t count,
struct xfs_iext_cursor *icur)
{
+ struct xfs_iext_cursor ncur = *icur;
+ struct xfs_bmbt_irec prev, got;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- struct xfs_bmbt_irec prev;
- int shift = 0;
int64_t freesp;
xfs_fsblock_t qblocks;
- int qshift = 0;
xfs_fsblock_t alloc_blocks = 0;
+ xfs_extlen_t plen;
+ int shift = 0;
+ int qshift = 0;
- if (offset + count <= XFS_ISIZE(ip))
- return 0;
-
- if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
- (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)))
+ /*
+ * As an exception we don't do any preallocation at all if the file is
+ * smaller than the minimum preallocation and we are using the default
+ * dynamic preallocation scheme, as it is likely this is the only write
+ * to the file that is going to be done.
+ */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
return 0;
/*
- * If an explicit allocsize is set, the file is small, or we
- * are writing behind a hole, then use the minimum prealloc:
+ * Use the minimum preallocation size for small files or if we are
+ * writing right after a hole.
*/
- if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) ||
- XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
- !xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+ !xfs_iext_prev_extent(ifp, &ncur, &prev) ||
prev.br_startoff + prev.br_blockcount < offset_fsb)
return mp->m_allocsize_blocks;
/*
- * Determine the initial size of the preallocation. We are beyond the
- * current EOF here, but we need to take into account whether this is
- * a sparse write or an extending write when determining the
- * preallocation size. Hence we need to look up the extent that ends
- * at the current write offset and use the result to determine the
- * preallocation size.
- *
- * If the extent is a hole, then preallocation is essentially disabled.
- * Otherwise we take the size of the preceding data extent as the basis
- * for the preallocation size. If the size of the extent is greater than
- * half the maximum extent length, then use the current offset as the
- * basis. This ensures that for large files the preallocation size
- * always extends to MAXEXTLEN rather than falling short due to things
- * like stripe unit/width alignment of real extents.
+ * Take the size of the preceding data extents as the basis for the
+ * preallocation size. Note that we don't care if the previous extents
+ * are written or not.
*/
- if (prev.br_blockcount <= (MAXEXTLEN >> 1))
- alloc_blocks = prev.br_blockcount << 1;
- else
+ plen = prev.br_blockcount;
+ while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
+ if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
+ isnullstartblock(got.br_startblock) ||
+ got.br_startoff + got.br_blockcount != prev.br_startoff ||
+ got.br_startblock + got.br_blockcount != prev.br_startblock)
+ break;
+ plen += got.br_blockcount;
+ prev = got;
+ }
+
+ /*
+ * If the size of the extents is greater than half the maximum extent
+ * length, then use the current offset as the basis. This ensures that
+ * for large files the preallocation size always extends to
+ * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe
+ * unit/width alignment of real extents.
+ */
+ alloc_blocks = plen * 2;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
- if (!alloc_blocks)
- goto check_writeio;
qblocks = alloc_blocks;
/*
- * MAXEXTLEN is not a power of two value but we round the prealloc down
- * to the nearest power of two value after throttling. To prevent the
- * round down from unconditionally reducing the maximum supported prealloc
- * size, we round up first, apply appropriate throttling, round down and
- * cap the value to MAXEXTLEN.
+ * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc
+ * down to the nearest power of two value after throttling. To prevent
+ * the round down from unconditionally reducing the maximum supported
+ * prealloc size, we round up first, apply appropriate throttling, round
+ * down and cap the value to XFS_BMBT_MAX_EXTLEN.
*/
- alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
freesp = percpu_counter_read_positive(&mp->m_fdblocks);
@@ -455,14 +452,14 @@ xfs_iomap_prealloc_size(
* Check each quota to cap the prealloc size, provide a shift value to
* throttle with and adjust amount of available space.
*/
- if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
&freesp);
- if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
&freesp);
- if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
&freesp);
/*
@@ -483,18 +480,17 @@ xfs_iomap_prealloc_size(
*/
if (alloc_blocks)
alloc_blocks = rounddown_pow_of_two(alloc_blocks);
- if (alloc_blocks > MAXEXTLEN)
- alloc_blocks = MAXEXTLEN;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_MAX_BMBT_EXTLEN;
/*
* If we are still trying to allocate more space than is
* available, squash the prealloc hard. This can happen if we
* have a large file on a small filesystem and the above
- * lowspace thresholds are smaller than MAXEXTLEN.
+ * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN.
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
-check_writeio:
if (alloc_blocks < mp->m_allocsize_blocks)
alloc_blocks = mp->m_allocsize_blocks;
trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
@@ -554,16 +550,16 @@ xfs_iomap_write_unwritten(
* here as we might be asked to write out the same inode that we
* complete here and might deadlock on the iolock.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
+ 0, true, &tp);
if (error)
return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
@@ -589,7 +585,7 @@ xfs_iomap_write_unwritten(
i_size_write(inode, i_size);
i_size = xfs_new_eof(ip, i_size);
if (i_size) {
- ip->i_d.di_size = i_size;
+ ip->i_disk_size = i_size;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -636,7 +632,7 @@ imap_needs_alloc(
imap->br_startblock == DELAYSTARTBLOCK)
return true;
/* we convert unwritten extents before copying the data for DAX */
- if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+ if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
return true;
return false;
}
@@ -668,7 +664,7 @@ xfs_ilock_for_iomap(
unsigned flags,
unsigned *lockmode)
{
- unsigned mode = XFS_ILOCK_SHARED;
+ unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
@@ -683,7 +679,7 @@ xfs_ilock_for_iomap(
* is an opencoded xfs_ilock_data_map_shared() call but with
* non-blocking behaviour.
*/
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ if (xfs_need_iread_extents(&ip->i_df)) {
if (flags & IOMAP_NOWAIT)
return -EAGAIN;
mode = XFS_ILOCK_EXCL;
@@ -712,6 +708,23 @@ relock:
return 0;
}
+/*
+ * Check that the imap we are going to return to the caller spans the entire
+ * range that the caller requested for the IO.
+ */
+static bool
+imap_spans_range(
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ if (imap->br_startoff > offset_fsb)
+ return false;
+ if (imap->br_startoff + imap->br_blockcount < end_fsb)
+ return false;
+ return true;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -729,11 +742,11 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -760,7 +773,8 @@ xfs_direct_write_iomap_begin(
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -772,13 +786,38 @@ xfs_direct_write_iomap_begin(
if (imap_needs_alloc(inode, flags, &imap, nimaps))
goto allocate_blocks;
+ /*
+ * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
+ * a single map so that we avoid partial IO failures due to the rest of
+ * the I/O range not covered by this map triggering an EAGAIN condition
+ * when it is subsequently mapped and aborting the I/O.
+ */
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
+ error = -EAGAIN;
+ if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+ goto out_unlock;
+ }
+
+ /*
+ * For overwrite only I/O, we cannot convert unwritten extents without
+ * requiring sub-block zeroing. This can only be done under an
+ * exclusive IOLOCK, hence return -EAGAIN if this is not a written
+ * extent to tell the caller to try again.
+ */
+ if (flags & IOMAP_OVERWRITE_ONLY) {
+ error = -EAGAIN;
+ if (imap.br_state != XFS_EXT_NORM &&
+ ((offset | length) & mp->m_blockmask))
+ goto out_unlock;
+ }
+
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
allocate_blocks:
error = -EAGAIN;
- if (flags & IOMAP_NOWAIT)
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
goto out_unlock;
/*
@@ -800,26 +839,28 @@ allocate_blocks:
xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
- &imap);
+ flags, &imap);
if (error)
return error;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ iomap_flags | IOMAP_F_NEW);
out_found_cow:
xfs_iunlock(ip, lockmode);
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
if (imap.br_startblock != HOLESTARTBLOCK) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
out_unlock:
- xfs_iunlock(ip, lockmode);
+ if (lockmode)
+ xfs_iunlock(ip, lockmode);
return error;
}
@@ -828,6 +869,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
};
static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (!xfs_is_cow_inode(ip))
+ return 0;
+
+ if (!written) {
+ xfs_reflink_cancel_cow_range(ip, pos, length, true);
+ return 0;
+ }
+
+ return xfs_reflink_end_cow(ip, pos, written);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
+static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -846,6 +914,10 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK;
int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
@@ -854,9 +926,11 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
- if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) ||
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
error = -EFSCORRUPTED;
goto out_unlock;
@@ -864,14 +938,12 @@ xfs_buffered_write_iomap_begin(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
- }
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
/*
- * Search the data fork fork first to look up our source mapping. We
+ * Search the data fork first to look up our source mapping. We
* always need the data fork map, as we have to return it to the
* iomap code so that the higher level write code can read data in to
* perform read-modify-write cycles for unaligned writes.
@@ -961,9 +1033,16 @@ xfs_buffered_write_iomap_begin(
if (error)
goto out_unlock;
- if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset,
- count, &icur);
+ if (eof && offset + count > XFS_ISIZE(ip)) {
+ /*
+ * Determine the initial size of the preallocation.
+ * We clean up any extra preallocation when the file is closed.
+ */
+ if (xfs_has_allocsize(mp))
+ prealloc_blocks = mp->m_allocsize_blocks;
+ else
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
+ offset, count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -1001,7 +1080,7 @@ retry:
prealloc_blocks = 0;
goto retry;
}
- /*FALLTHRU*/
+ fallthrough;
default:
goto out_unlock;
}
@@ -1017,23 +1096,24 @@ retry:
*/
xfs_iunlock(ip, XFS_ILOCK_EXCL);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
found_imap:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
found_cow:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
- } else {
- xfs_trim_extent(&cmap, offset_fsb,
- imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+
+ xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1092,7 +1172,7 @@ xfs_buffered_write_iomap_end(
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
end_fsb - start_fsb);
- if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (error && !xfs_is_shutdown(mp)) {
xfs_alert(mp, "%s: unable to clean up ino %lld",
__func__, ip->i_ino);
return error;
@@ -1123,11 +1203,11 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
@@ -1142,7 +1222,8 @@ xfs_read_iomap_begin(
if (error)
return error;
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ shared ? IOMAP_F_SHARED : 0);
}
const struct iomap_ops xfs_read_iomap_ops = {
@@ -1168,15 +1249,13 @@ xfs_seek_iomap_begin(
int error = 0;
unsigned lockmode;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
lockmode = xfs_ilock_data_map_shared(ip);
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
- }
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
/*
@@ -1203,7 +1282,8 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1225,7 +1305,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1252,18 +1332,18 @@ xfs_xattr_iomap_begin(
int nimaps = 1, error = 0;
unsigned lockmode;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
error = -ENOENT;
goto out_unlock;
}
- ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
+ ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
@@ -1272,9 +1352,40 @@ out_unlock:
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
.iomap_begin = xfs_xattr_iomap_begin,
};
+
+int
+xfs_zero_range(
+ struct xfs_inode *ip,
+ loff_t pos,
+ loff_t len,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(inode))
+ return dax_zero_range(inode, pos, len, did_zero,
+ &xfs_direct_write_iomap_ops);
+ return iomap_zero_range(inode, pos, len, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_truncate_page(
+ struct xfs_inode *ip,
+ loff_t pos,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (IS_DAX(inode))
+ return dax_truncate_page(inode, pos, did_zero,
+ &xfs_direct_write_iomap_ops);
+ return iomap_truncate_page(inode, pos, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e..c782e8c0479c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -12,13 +12,19 @@ struct xfs_inode;
struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
- xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
+ xfs_fileoff_t count_fsb, unsigned int flags,
+ struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
-int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *, u16);
+int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+ struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
+ u16 iomap_flags);
+
+int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+ bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -45,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 81f2f93caec0..2e10e1c66ad6 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -13,6 +13,8 @@
#include "xfs_inode.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
@@ -21,18 +23,20 @@
#include "xfs_dir2.h"
#include "xfs_iomap.h"
#include "xfs_error.h"
+#include "xfs_ioctl.h"
+#include "xfs_xattr.h"
-#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
#include <linux/iversion.h>
+#include <linux/fiemap.h>
/*
- * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * Directories have different lock order w.r.t. mmap_lock compared to regular
* files. This is due to readdir potentially triggering page faults on a user
* buffer inside filldir(), and this happens with the ilock on the directory
* held. For regular files, the lock order is the other way around - the
- * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * mmap_lock is taken during the page fault, and then we lock the ilock to do
* block mapping. Hence we need a different class for the directory ilock so
* that lockdep can tell them apart.
*/
@@ -50,10 +54,15 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name,
- strlen(xattr->name),
- xattr->value, xattr->value_len,
- ATTR_SECURE);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_SECURE,
+ .name = xattr->name,
+ .namelen = strlen(xattr->name),
+ .value = xattr->value,
+ .valuelen = xattr->value_len,
+ };
+ error = xfs_attr_change(&args);
if (error < 0)
break;
}
@@ -66,9 +75,8 @@ xfs_initxattrs(
* these attrs can be journalled at inode creation time (along with the
* inode, of course, such that log replay can't cause these to be lost).
*/
-
-STATIC int
-xfs_init_security(
+int
+xfs_inode_init_security(
struct inode *inode,
struct inode *dir,
const struct qstr *qstr)
@@ -113,7 +121,7 @@ xfs_cleanup_inode(
/* Oh, the horror.
* If we can't add the ACL or we fail in
- * xfs_init_security we must back out.
+ * xfs_inode_init_security we must back out.
* ENOSPC can hit here, among other things.
*/
xfs_dentry_to_name(&teardown, dentry);
@@ -121,13 +129,45 @@ xfs_cleanup_inode(
xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
}
+/*
+ * Check to see if we are likely to need an extended attribute to be added to
+ * the inode we are about to allocate. This allows the attribute fork to be
+ * created during the inode allocation, reducing the number of transactions we
+ * need to do in this fast path.
+ *
+ * The security checks are optimistic, but not guaranteed. The two LSMs that
+ * require xattrs to be added here (selinux and smack) are also the only two
+ * LSMs that add a sb->s_security structure to the superblock. Hence if security
+ * is enabled and sb->s_security is set, we have a pretty good idea that we are
+ * going to be asked to add a security xattr immediately after allocating the
+ * xfs inode and instantiating the VFS inode.
+ */
+static inline bool
+xfs_create_need_xattr(
+ struct inode *dir,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
+{
+ if (acl)
+ return true;
+ if (default_acl)
+ return true;
+#if IS_ENABLED(CONFIG_SECURITY)
+ if (dir->i_sb->s_security)
+ return true;
+#endif
+ return false;
+}
+
+
STATIC int
xfs_generic_create(
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry,
umode_t mode,
dev_t rdev,
- bool tmpfile) /* unnamed file */
+ struct file *tmpfile) /* unnamed file */
{
struct inode *inode;
struct xfs_inode *ip = NULL;
@@ -156,20 +196,21 @@ xfs_generic_create(
goto out_free_acl;
if (!tmpfile) {
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev,
+ xfs_create_need_xattr(dir, default_acl, acl),
+ &ip);
} else {
- error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
+ error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip);
}
if (unlikely(error))
goto out_free_acl;
inode = VFS_I(ip);
- error = xfs_init_security(inode, dir, &dentry->d_name);
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
if (unlikely(error))
goto out_cleanup_inode;
-#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -180,7 +221,6 @@ xfs_generic_create(
if (error)
goto out_cleanup_inode;
}
-#endif
xfs_setup_iops(ip);
@@ -194,17 +234,15 @@ xfs_generic_create(
* d_tmpfile can immediately set it back to zero.
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
+ d_tmpfile(tmpfile, inode);
} else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
out_free_acl:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
return error;
out_cleanup_inode:
@@ -217,31 +255,35 @@ xfs_generic_create(
STATIC int
xfs_vn_mknod(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t rdev)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
{
- return xfs_generic_create(dir, dentry, mode, rdev, false);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, NULL);
}
STATIC int
xfs_vn_create(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- bool flags)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ bool flags)
{
- return xfs_vn_mknod(dir, dentry, mode, 0);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, NULL);
}
STATIC int
xfs_vn_mkdir(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+ return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
+ NULL);
}
STATIC struct dentry *
@@ -351,16 +393,17 @@ xfs_vn_unlink(
* but still hashed. This is incompatible with case-insensitive
* mode, so invalidate (unhash) the dentry in CI-mode.
*/
- if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+ if (xfs_has_asciici(XFS_M(dir->i_sb)))
d_invalidate(dentry);
return 0;
}
STATIC int
xfs_vn_symlink(
- struct inode *dir,
- struct dentry *dentry,
- const char *symname)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
{
struct inode *inode;
struct xfs_inode *cip = NULL;
@@ -374,13 +417,13 @@ xfs_vn_symlink(
if (unlikely(error))
goto out;
- error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+ error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip);
if (unlikely(error))
goto out;
inode = VFS_I(cip);
- error = xfs_init_security(inode, dir, &dentry->d_name);
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
if (unlikely(error))
goto out_cleanup_inode;
@@ -400,11 +443,12 @@ xfs_vn_symlink(
STATIC int
xfs_vn_rename(
- struct inode *odir,
- struct dentry *odentry,
- struct inode *ndir,
- struct dentry *ndentry,
- unsigned int flags)
+ struct user_namespace *mnt_userns,
+ struct inode *odir,
+ struct dentry *odentry,
+ struct inode *ndir,
+ struct dentry *ndentry,
+ unsigned int flags)
{
struct inode *new_inode = d_inode(ndentry);
int omode = 0;
@@ -428,8 +472,8 @@ xfs_vn_rename(
if (unlikely(error))
return error;
- return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
- XFS_I(ndir), &nname,
+ return xfs_rename(mnt_userns, XFS_I(odir), &oname,
+ XFS_I(d_inode(odentry)), XFS_I(ndir), &nname,
new_inode ? XFS_I(new_inode) : NULL, flags);
}
@@ -467,27 +511,6 @@ xfs_vn_get_link(
return ERR_PTR(error);
}
-STATIC const char *
-xfs_vn_get_link_inline(
- struct dentry *dentry,
- struct inode *inode,
- struct delayed_call *done)
-{
- struct xfs_inode *ip = XFS_I(inode);
- char *link;
-
- ASSERT(ip->i_df.if_flags & XFS_IFINLINE);
-
- /*
- * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
- * if_data is junk.
- */
- link = ip->i_df.if_u1.if_data;
- if (XFS_IS_CORRUPT(ip->i_mount, !link))
- return ERR_PTR(-EFSCORRUPTED);
- return link;
-}
-
static uint32_t
xfs_stat_blksize(
struct xfs_inode *ip)
@@ -499,7 +522,7 @@ xfs_stat_blksize(
* always return the realtime extent size.
*/
if (XFS_IS_REALTIME_INODE(ip))
- return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+ return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip));
/*
* Allow large block sizes to be reported to userspace programs if the
@@ -514,10 +537,10 @@ xfs_stat_blksize(
* default buffered I/O size, return that, otherwise return the compat
* default.
*/
- if (mp->m_flags & XFS_MOUNT_LARGEIO) {
+ if (xfs_has_large_iosize(mp)) {
if (mp->m_swidth)
- return mp->m_swidth << mp->m_sb.sb_blocklog;
- if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ return XFS_FSB_TO_B(mp, mp->m_swidth);
+ if (xfs_has_allocsize(mp))
return 1U << mp->m_allocsize_log;
}
@@ -526,6 +549,7 @@ xfs_stat_blksize(
STATIC int
xfs_vn_getattr(
+ struct user_namespace *mnt_userns,
const struct path *path,
struct kstat *stat,
u32 request_mask,
@@ -534,29 +558,30 @@ xfs_vn_getattr(
struct inode *inode = d_inode(path->dentry);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
trace_xfs_getattr(ip);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
stat->size = XFS_ISIZE(ip);
stat->dev = inode->i_sb->s_dev;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = inode->i_uid;
- stat->gid = inode->i_gid;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
stat->ctime = inode->i_ctime;
- stat->blocks =
- XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+ stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
- if (ip->i_d.di_version == 3) {
+ if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
- stat->btime = ip->i_d.di_crtime;
+ stat->btime = ip->i_crtime;
}
}
@@ -564,11 +589,11 @@ xfs_vn_getattr(
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
*/
- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+ if (ip->i_diflags & XFS_DIFLAG_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
- if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
+ if (ip->i_diflags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
@@ -581,6 +606,16 @@ xfs_vn_getattr(
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
+ case S_IFREG:
+ if (request_mask & STATX_DIOALIGN) {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
@@ -590,51 +625,21 @@ xfs_vn_getattr(
return 0;
}
-static void
-xfs_setattr_mode(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
- umode_t mode = iattr->ia_mode;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
-}
-
-void
-xfs_setattr_time(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (iattr->ia_valid & ATTR_ATIME)
- inode->i_atime = iattr->ia_atime;
- if (iattr->ia_valid & ATTR_CTIME)
- inode->i_ctime = iattr->ia_ctime;
- if (iattr->ia_valid & ATTR_MTIME)
- inode->i_mtime = iattr->ia_mtime;
-}
-
static int
xfs_vn_change_ok(
- struct dentry *dentry,
- struct iattr *iattr)
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *iattr)
{
struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (xfs_is_readonly(mp))
return -EROFS;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
- return setattr_prepare(dentry, iattr);
+ return setattr_prepare(mnt_userns, dentry, iattr);
}
/*
@@ -643,21 +648,21 @@ xfs_vn_change_ok(
* Caution: The caller of this function is responsible for calling
* setattr_prepare() or otherwise verifying the change is fine.
*/
-int
+static int
xfs_setattr_nonsize(
+ struct user_namespace *mnt_userns,
struct xfs_inode *ip,
- struct iattr *iattr,
- int flags)
+ struct iattr *iattr)
{
xfs_mount_t *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
int mask = iattr->ia_valid;
xfs_trans_t *tp;
int error;
- kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
- kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
struct xfs_dquot *udqp = NULL, *gdqp = NULL;
- struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
+ struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL;
ASSERT((mask & ATTR_SIZE) == 0);
@@ -673,13 +678,15 @@ xfs_setattr_nonsize(
uint qflags = 0;
if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
- uid = iattr->ia_uid;
+ uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
qflags |= XFS_QMOPT_UQUOTA;
} else {
uid = inode->i_uid;
}
if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
- gid = iattr->ia_gid;
+ gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
qflags |= XFS_QMOPT_GQUOTA;
} else {
gid = inode->i_gid;
@@ -692,114 +699,50 @@ xfs_setattr_nonsize(
*/
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
- error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
- xfs_kgid_to_gid(gid),
- ip->i_d.di_projid,
+ error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_projid,
qflags, &udqp, &gdqp, NULL);
if (error)
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
- * Change file ownership. Must be the owner or privileged.
+ * Register quota modifications in the transaction. Must be the owner
+ * or privileged. These IDs could have changed since we last looked at
+ * them. But, we're assured that if the ownership did change while we
+ * didn't have the inode locked, inode's dquot(s) would have changed
+ * also.
*/
- if (mask & (ATTR_UID|ATTR_GID)) {
- /*
- * These IDs could have changed since we last looked at them.
- * But, we're assured that if the ownership did change
- * while we didn't have the inode locked, inode's dquot(s)
- * would have changed also.
- */
- iuid = inode->i_uid;
- igid = inode->i_gid;
- gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
- uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
-
- /*
- * Do a quota reservation only if uid/gid is actually
- * going to change.
- */
- if (XFS_IS_QUOTA_RUNNING(mp) &&
- ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
- (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
- ASSERT(tp);
- error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- NULL, capable(CAP_FOWNER) ?
- XFS_QMOPT_FORCE_RES : 0);
- if (error) /* out of quota */
- goto out_cancel;
- }
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ i_uid_needs_update(mnt_userns, iattr, inode)) {
+ ASSERT(udqp);
+ old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
}
-
- /*
- * Change file ownership. Must be the owner or privileged.
- */
- if (mask & (ATTR_UID|ATTR_GID)) {
- /*
- * CAP_FSETID overrides the following restrictions:
- *
- * The set-user-ID and set-group-ID bits of a file will be
- * cleared upon successful return from chown()
- */
- if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
- !capable(CAP_FSETID))
- inode->i_mode &= ~(S_ISUID|S_ISGID);
-
- /*
- * Change the ownerships and register quota modifications
- * in the transaction.
- */
- if (!uid_eq(iuid, uid)) {
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(mask & ATTR_UID);
- ASSERT(udqp);
- olddquot1 = xfs_qm_vop_chown(tp, ip,
- &ip->i_udquot, udqp);
- }
- ip->i_d.di_uid = xfs_kuid_to_uid(uid);
- inode->i_uid = uid;
- }
- if (!gid_eq(igid, gid)) {
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
- !XFS_IS_PQUOTA_ON(mp));
- ASSERT(mask & ATTR_GID);
- ASSERT(gdqp);
- olddquot2 = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
- }
- ip->i_d.di_gid = xfs_kgid_to_gid(gid);
- inode->i_gid = gid;
- }
+ if (XFS_IS_GQUOTA_ON(mp) &&
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
+ ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
+ ASSERT(gdqp);
+ old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
}
- if (mask & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
/*
* Release any dquot(s) the inode had kept before chown.
*/
- xfs_qm_dqrele(olddquot1);
- xfs_qm_dqrele(olddquot2);
+ xfs_qm_dqrele(old_udqp);
+ xfs_qm_dqrele(old_gdqp);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -813,39 +756,20 @@ xfs_setattr_nonsize(
* to attr_set. No previous user of the generic
* Posix ACL code seems to care about this issue either.
*/
- if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
- error = posix_acl_chmod(inode, inode->i_mode);
+ if (mask & ATTR_MODE) {
+ error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
if (error)
return error;
}
return 0;
-out_cancel:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
return error;
}
-int
-xfs_vn_setattr_nonsize(
- struct dentry *dentry,
- struct iattr *iattr)
-{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
- int error;
-
- trace_xfs_setattr(ip);
-
- error = xfs_vn_change_ok(dentry, iattr);
- if (error)
- return error;
- return xfs_setattr_nonsize(ip, iattr, 0);
-}
-
/*
* Truncate file. Must have write permission and not be a directory.
*
@@ -854,6 +778,7 @@ xfs_vn_setattr_nonsize(
*/
STATIC int
xfs_setattr_size(
+ struct user_namespace *mnt_userns,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -869,7 +794,7 @@ xfs_setattr_size(
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
@@ -877,7 +802,7 @@ xfs_setattr_size(
/*
* Short circuit the truncate case for zero length files.
*/
- if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
+ if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) {
if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
return 0;
@@ -885,7 +810,7 @@ xfs_setattr_size(
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(ip, iattr, 0);
+ return xfs_setattr_nonsize(mnt_userns, ip, iattr);
}
/*
@@ -912,11 +837,20 @@ xfs_setattr_size(
*/
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
- error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, oldsize, newsize - oldsize,
+ &did_zeroing);
} else {
- error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_buffered_write_iomap_ops);
+ /*
+ * iomap won't detect a dirty page over an unwritten block (or a
+ * cow block over a hole) and subsequently skips zeroing the
+ * newly post-EOF portion of the page. Flush the new EOF to
+ * convert the block before the pagecache truncate.
+ */
+ error = filemap_write_and_wait_range(inode->i_mapping, newsize,
+ newsize);
+ if (error)
+ return error;
+ error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
if (error)
@@ -940,8 +874,8 @@ xfs_setattr_size(
* operation.
*
* And we update in-core i_size and truncate page cache beyond newsize
- * before writeback the [di_size, newsize] range, so we're guaranteed
- * not to write stale data past the new EOF on truncate down.
+ * before writeback the [i_disk_size, newsize] range, so we're
+ * guaranteed not to write stale data past the new EOF on truncate down.
*/
truncate_setsize(inode, newsize);
@@ -954,9 +888,9 @@ xfs_setattr_size(
* otherwise those blocks may not be zeroed after a crash.
*/
if (did_zeroing ||
- (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
+ (newsize > ip->i_disk_size && oldsize != ip->i_disk_size)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- ip->i_d.di_size, newsize - 1);
+ ip->i_disk_size, newsize - 1);
if (error)
return error;
}
@@ -998,7 +932,7 @@ xfs_setattr_size(
* permanent before actually freeing any blocks it doesn't matter if
* they get written to.
*/
- ip->i_d.di_size = newsize;
+ ip->i_disk_size = newsize;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (newsize <= oldsize) {
@@ -1019,16 +953,13 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- if (iattr->ia_valid & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (xfs_has_wsync(mp))
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
@@ -1044,6 +975,7 @@ out_trans_cancel:
int
xfs_vn_setattr_size(
+ struct user_namespace *mnt_userns,
struct dentry *dentry,
struct iattr *iattr)
{
@@ -1052,22 +984,23 @@ xfs_vn_setattr_size(
trace_xfs_setattr(ip);
- error = xfs_vn_change_ok(dentry, iattr);
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(ip, iattr);
+ return xfs_setattr_size(mnt_userns, ip, iattr);
}
STATIC int
xfs_vn_setattr(
+ struct user_namespace *mnt_userns,
struct dentry *dentry,
struct iattr *iattr)
{
+ struct inode *inode = d_inode(dentry);
+ struct xfs_inode *ip = XFS_I(inode);
int error;
if (iattr->ia_valid & ATTR_SIZE) {
- struct inode *inode = d_inode(dentry);
- struct xfs_inode *ip = XFS_I(inode);
uint iolock;
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
@@ -1079,10 +1012,14 @@ xfs_vn_setattr(
return error;
}
- error = xfs_vn_setattr_size(dentry, iattr);
+ error = xfs_vn_setattr_size(mnt_userns, dentry, iattr);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else {
- error = xfs_vn_setattr_nonsize(dentry, iattr);
+ trace_xfs_setattr(ip);
+
+ error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
+ if (!error)
+ error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
}
return error;
@@ -1153,11 +1090,14 @@ xfs_vn_fiemap(
STATIC int
xfs_vn_tmpfile(
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+ struct user_namespace *mnt_userns,
+ struct inode *dir,
+ struct file *file,
+ umode_t mode)
{
- return xfs_generic_create(dir, dentry, mode, 0, true);
+ int err = xfs_generic_create(mnt_userns, dir, file->f_path.dentry, mode, 0, file);
+
+ return finish_open_simple(file, err);
}
static const struct inode_operations xfs_inode_operations = {
@@ -1168,6 +1108,8 @@ static const struct inode_operations xfs_inode_operations = {
.listxattr = xfs_vn_listxattr,
.fiemap = xfs_vn_fiemap,
.update_time = xfs_vn_update_time,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
static const struct inode_operations xfs_dir_inode_operations = {
@@ -1193,6 +1135,8 @@ static const struct inode_operations xfs_dir_inode_operations = {
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.tmpfile = xfs_vn_tmpfile,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1218,6 +1162,8 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.tmpfile = xfs_vn_tmpfile,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
static const struct inode_operations xfs_symlink_inode_operations = {
@@ -1228,14 +1174,6 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.update_time = xfs_vn_update_time,
};
-static const struct inode_operations xfs_inline_symlink_inode_operations = {
- .get_link = xfs_vn_get_link_inline,
- .getattr = xfs_vn_getattr,
- .setattr = xfs_vn_setattr,
- .listxattr = xfs_vn_listxattr,
- .update_time = xfs_vn_update_time,
-};
-
/* Figure out if this file actually supports DAX. */
static bool
xfs_inode_supports_dax(
@@ -1243,13 +1181,12 @@ xfs_inode_supports_dax(
{
struct xfs_mount *mp = ip->i_mount;
- /* Only supported on non-reflinked files. */
- if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip))
+ /* Only supported on regular files. */
+ if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
- /* DAX mount option or DAX iflag must be set. */
- if (!(mp->m_flags & XFS_MOUNT_DAX) &&
- !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+ /* Only supported on non-reflinked files. */
+ if (xfs_is_reflink_inode(ip))
return false;
/* Block size must match page size */
@@ -1260,35 +1197,60 @@ xfs_inode_supports_dax(
return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
}
-STATIC void
+static bool
+xfs_inode_should_enable_dax(
+ struct xfs_inode *ip)
+{
+ if (!IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+ if (xfs_has_dax_never(ip->i_mount))
+ return false;
+ if (!xfs_inode_supports_dax(ip))
+ return false;
+ if (xfs_has_dax_always(ip->i_mount))
+ return true;
+ if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
+ return true;
+ return false;
+}
+
+void
xfs_diflags_to_iflags(
- struct inode *inode,
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ bool init)
{
- uint16_t flags = ip->i_d.di_flags;
-
- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
- S_NOATIME | S_DAX);
-
- if (flags & XFS_DIFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- if (flags & XFS_DIFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- if (flags & XFS_DIFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- if (flags & XFS_DIFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- if (xfs_inode_supports_dax(ip))
- inode->i_flags |= S_DAX;
+ struct inode *inode = VFS_I(ip);
+ unsigned int xflags = xfs_ip2xflags(ip);
+ unsigned int flags = 0;
+
+ ASSERT(!(IS_DAX(inode) && init));
+
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ flags |= S_IMMUTABLE;
+ if (xflags & FS_XFLAG_APPEND)
+ flags |= S_APPEND;
+ if (xflags & FS_XFLAG_SYNC)
+ flags |= S_SYNC;
+ if (xflags & FS_XFLAG_NOATIME)
+ flags |= S_NOATIME;
+ if (init && xfs_inode_should_enable_dax(ip))
+ flags |= S_DAX;
+
+ /*
+ * S_DAX can only be set during inode initialization and is never set by
+ * the VFS, so we cannot mask off S_DAX in i_flags.
+ */
+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
+ inode->i_flags |= flags;
}
/*
* Initialize the Linux inode.
*
* When reading existing inodes from disk this is called directly from xfs_iget,
- * when creating a new inode it is called from xfs_ialloc after setting up the
- * inode. These callers have different criteria for clearing XFS_INEW, so leave
- * it up to the caller to deal with unlocking the inode appropriately.
+ * when creating a new inode it is called from xfs_init_new_inode after setting
+ * up the inode. These callers have different criteria for clearing XFS_INEW, so
+ * leave it up to the caller to deal with unlocking the inode appropriately.
*/
void
xfs_setup_inode(
@@ -1298,17 +1260,14 @@ xfs_setup_inode(
gfp_t gfp_mask;
inode->i_ino = ip->i_ino;
- inode->i_state = I_NEW;
+ inode->i_state |= I_NEW;
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
inode_fake_hash(inode);
- inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
- inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
-
- i_size_write(inode, ip->i_d.di_size);
- xfs_diflags_to_iflags(inode, ip);
+ i_size_write(inode, ip->i_disk_size);
+ xfs_diflags_to_iflags(ip, true);
if (S_ISDIR(inode->i_mode)) {
/*
@@ -1336,7 +1295,7 @@ xfs_setup_inode(
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
- if (!XFS_IFORK_Q(ip)) {
+ if (!xfs_inode_has_attr_fork(ip)) {
inode_has_no_xattr(inode);
cache_no_acl(inode);
}
@@ -1358,17 +1317,14 @@ xfs_setup_iops(
inode->i_mapping->a_ops = &xfs_address_space_operations;
break;
case S_IFDIR:
- if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+ if (xfs_has_asciici(XFS_M(inode->i_sb)))
inode->i_op = &xfs_dir_ci_inode_operations;
else
inode->i_op = &xfs_dir_inode_operations;
inode->i_fop = &xfs_dir_file_operations;
break;
case S_IFLNK:
- if (ip->i_df.if_flags & XFS_IFINLINE)
- inode->i_op = &xfs_inline_symlink_inode_operations;
- else
- inode->i_op = &xfs_symlink_inode_operations;
+ inode->i_op = &xfs_symlink_inode_operations;
break;
default:
inode->i_op = &xfs_inode_operations;
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 4d24ff309f59..e570dcb5df8d 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -13,15 +13,10 @@ extern const struct file_operations xfs_dir_file_operations;
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-/*
- * Internal setattr interfaces.
- */
-#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
+int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *vap);
-extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
-extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
- int flags);
-extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap);
-extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap);
+int xfs_inode_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 4b31c29b7e6b..a1c2bcf65d37 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -19,6 +19,7 @@
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_health.h"
+#include "xfs_trans.h"
/*
* Bulk Stat
@@ -54,15 +55,19 @@ struct xfs_bstat_chunk {
STATIC int
xfs_bulkstat_one_int(
struct xfs_mount *mp,
+ struct user_namespace *mnt_userns,
struct xfs_trans *tp,
xfs_ino_t ino,
struct xfs_bstat_chunk *bc)
{
- struct xfs_icdinode *dic; /* dinode core info pointer */
+ struct user_namespace *sb_userns = mp->m_super->s_user_ns;
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
struct xfs_bulkstat *buf = bc->buf;
+ xfs_extnum_t nextents;
int error = -EINVAL;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
if (xfs_internal_inum(mp, ino))
goto out_advance;
@@ -78,17 +83,17 @@ xfs_bulkstat_one_int(
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);
-
- dic = &ip->i_d;
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_projectid = ip->i_d.di_projid;
+ buf->bs_projectid = ip->i_projid;
buf->bs_ino = ino;
- buf->bs_uid = dic->di_uid;
- buf->bs_gid = dic->di_gid;
- buf->bs_size = dic->di_size;
+ buf->bs_uid = from_kuid(sb_userns, vfsuid_into_kuid(vfsuid));
+ buf->bs_gid = from_kgid(sb_userns, vfsgid_into_kgid(vfsgid));
+ buf->bs_size = ip->i_disk_size;
buf->bs_nlink = inode->i_nlink;
buf->bs_atime = inode->i_atime.tv_sec;
@@ -97,25 +102,31 @@ xfs_bulkstat_one_int(
buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
buf->bs_ctime = inode->i_ctime.tv_sec;
buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
- buf->bs_btime = dic->di_crtime.tv_sec;
- buf->bs_btime_nsec = dic->di_crtime.tv_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
buf->bs_xflags = xfs_ip2xflags(ip);
- buf->bs_extsize_blks = dic->di_extsize;
- buf->bs_extents = dic->di_nextents;
+ buf->bs_extsize_blks = ip->i_extsize;
+
+ nextents = xfs_ifork_nextents(&ip->i_df);
+ if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+ buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+ else
+ buf->bs_extents64 = nextents;
+
xfs_bulkstat_health(ip, buf);
- buf->bs_aextents = dic->di_anextents;
- buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+ buf->bs_aextents = xfs_ifork_nextents(&ip->i_af);
+ buf->bs_forkoff = xfs_inode_fork_boff(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
- if (dic->di_version == 3) {
- if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
- buf->bs_cowextsize_blks = dic->di_cowextsize;
+ if (xfs_has_v3inodes(mp)) {
+ buf->bs_btime = ip->i_crtime.tv_sec;
+ buf->bs_btime_nsec = ip->i_crtime.tv_nsec;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ buf->bs_cowextsize_blks = ip->i_cowextsize;
}
- switch (dic->di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_DEV:
buf->bs_rdev = sysv_encode_dev(inode->i_rdev);
buf->bs_blksize = BLKDEV_IOSIZE;
@@ -130,7 +141,7 @@ xfs_bulkstat_one_int(
case XFS_DINODE_FMT_BTREE:
buf->bs_rdev = 0;
buf->bs_blksize = mp->m_sb.sb_blocksize;
- buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
+ buf->bs_blocks = ip->i_nblocks + ip->i_delayed_blks;
break;
}
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -164,8 +175,15 @@ xfs_bulkstat_one(
.formatter = formatter,
.breq = breq,
};
+ struct xfs_trans *tp;
int error;
+ if (breq->mnt_userns != &init_user_ns) {
+ xfs_warn_ratelimited(breq->mp,
+ "bulkstat not supported inside of idmapped mounts.");
+ return -EINVAL;
+ }
+
ASSERT(breq->icount == 1);
bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
@@ -173,8 +191,18 @@ xfs_bulkstat_one(
if (!bc.buf)
return -ENOMEM;
- error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc);
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+ error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp,
+ breq->startino, &bc);
+ xfs_trans_cancel(tp);
+out:
kmem_free(bc.buf);
/*
@@ -194,9 +222,10 @@ xfs_bulkstat_iwalk(
xfs_ino_t ino,
void *data)
{
+ struct xfs_bstat_chunk *bc = data;
int error;
- error = xfs_bulkstat_one_int(mp, tp, ino, data);
+ error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data);
/* bulkstat just skips over missing inodes */
if (error == -ENOENT || error == -EINVAL)
return 0;
@@ -237,8 +266,15 @@ xfs_bulkstat(
.formatter = formatter,
.breq = breq,
};
+ struct xfs_trans *tp;
+ unsigned int iwalk_flags = 0;
int error;
+ if (breq->mnt_userns != &init_user_ns) {
+ xfs_warn_ratelimited(breq->mp,
+ "bulkstat not supported inside of idmapped mounts.");
+ return -EINVAL;
+ }
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
@@ -247,9 +283,21 @@ xfs_bulkstat(
if (!bc.buf)
return -ENOMEM;
- error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags,
- xfs_bulkstat_iwalk, breq->icount, &bc);
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+ if (breq->flags & XFS_IBULK_SAME_AG)
+ iwalk_flags |= XFS_IWALK_SAME_AG;
+
+ error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
+ xfs_bulkstat_iwalk, breq->icount, &bc);
+ xfs_trans_cancel(tp);
+out:
kmem_free(bc.buf);
/*
@@ -362,13 +410,24 @@ xfs_inumbers(
.formatter = formatter,
.breq = breq,
};
+ struct xfs_trans *tp;
int error = 0;
if (xfs_bulkstat_already_done(breq->mp, breq->startino))
return 0;
- error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags,
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(breq->mp, &tp);
+ if (error)
+ goto out;
+
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
xfs_inumbers_walk, breq->icount, &ic);
+ xfs_trans_cancel(tp);
+out:
/*
* We found some inode groups, so clear the error status and return
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 96a1e2a9be3f..e2d0eba43f35 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -8,6 +8,7 @@
/* In-memory representation of a userspace request for batch inode data. */
struct xfs_ibulk {
struct xfs_mount *mp;
+ struct user_namespace *mnt_userns;
void __user *ubuffer; /* user output buffer */
xfs_ino_t startino; /* start with this inode */
unsigned int icount; /* number of elements in ubuffer */
@@ -16,7 +17,10 @@ struct xfs_ibulk {
};
/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
+#define XFS_IBULK_SAME_AG (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64 (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
new file mode 100644
index 000000000000..43005ce8bd48
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+struct kmem_cache *xfs_iunlink_cache;
+
+static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_iunlink_item, item);
+}
+
+static void
+xfs_iunlink_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+
+ xfs_perag_put(iup->pag);
+ kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip));
+}
+
+
+static uint64_t
+xfs_iunlink_item_sort(
+ struct xfs_log_item *lip)
+{
+ return IUL_ITEM(lip)->ip->i_ino;
+}
+
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+xfs_iunlink_log_dinode(
+ struct xfs_trans *tp,
+ struct xfs_iunlink_item *iup)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip = iup->ip;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ int offset;
+ int error;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ if (error)
+ return error;
+ /*
+ * Don't log the unlinked field on stale buffers as this may be the
+ * transaction that frees the inode cluster and relogging the buffer
+ * here will incorrectly remove the stale state.
+ */
+ if (ibp->b_flags & XBF_STALE)
+ goto out;
+
+ dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+ /* Make sure the old pointer isn't garbage. */
+ if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
+ XFS_INO_TO_AGINO(mp, ip->i_ino),
+ be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
+ offset = ip->i_imap.im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * On precommit, we grab the inode cluster buffer for the inode number we were
+ * passed, then update the next unlinked field for that inode in the buffer and
+ * log the buffer. This ensures that the inode cluster buffer was logged in the
+ * correct order w.r.t. other inode cluster buffers. We can then remove the
+ * iunlink item from the transaction and release it as it is has now served it's
+ * purpose.
+ */
+static int
+xfs_iunlink_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+ int error;
+
+ error = xfs_iunlink_log_dinode(tp, iup);
+ list_del(&lip->li_trans);
+ xfs_iunlink_item_release(lip);
+ return error;
+}
+
+static const struct xfs_item_ops xfs_iunlink_item_ops = {
+ .iop_release = xfs_iunlink_item_release,
+ .iop_sort = xfs_iunlink_item_sort,
+ .iop_precommit = xfs_iunlink_item_precommit,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the iunlink item.
+ */
+int
+xfs_iunlink_log_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_iunlink_item *iup;
+
+ ASSERT(xfs_verify_agino_or_null(pag, next_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked));
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ if (ip->i_next_unlinked == next_agino) {
+ if (next_agino != NULLAGINO)
+ return -EFSCORRUPTED;
+ return 0;
+ }
+
+ iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK,
+ &xfs_iunlink_item_ops);
+
+ iup->ip = ip;
+ iup->next_agino = next_agino;
+ iup->old_agino = ip->i_next_unlinked;
+
+ atomic_inc(&pag->pag_ref);
+ iup->pag = pag;
+
+ xfs_trans_add_item(tp, &iup->item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &iup->item.li_flags);
+ return 0;
+}
+
diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h
new file mode 100644
index 000000000000..c793cdcaccde
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_IUNLINK_ITEM_H
+#define XFS_IUNLINK_ITEM_H 1
+
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_perag;
+
+/* in memory log item structure */
+struct xfs_iunlink_item {
+ struct xfs_log_item item;
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t next_agino;
+ xfs_agino_t old_agino;
+};
+
+extern struct kmem_cache *xfs_iunlink_cache;
+
+int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_perag *pag, xfs_agino_t next_agino);
+
+#endif /* XFS_IUNLINK_ITEM_H */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 233dcc8784db..7558486f4937 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -21,6 +21,7 @@
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"
+#include "xfs_ag.h"
/*
* Walking Inodes in the Filesystem
@@ -51,10 +52,14 @@ struct xfs_iwalk_ag {
struct xfs_mount *mp;
struct xfs_trans *tp;
+ struct xfs_perag *pag;
/* Where do we start the traversal? */
xfs_ino_t startino;
+ /* What was the last inode number we saw when iterating the inobt? */
+ xfs_ino_t lastino;
+
/* Array of inobt records we cache. */
struct xfs_inobt_rec_incore *recs;
@@ -78,6 +83,9 @@ struct xfs_iwalk_ag {
/* Skip empty inobt records? */
unsigned int skip_empty:1;
+
+ /* Drop the (hopefully empty) transaction when calling iwalk_fn. */
+ unsigned int drop_trans:1;
};
/*
@@ -87,7 +95,7 @@ struct xfs_iwalk_ag {
STATIC void
xfs_iwalk_ichunk_ra(
struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_inobt_rec_incore *irec)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
@@ -103,7 +111,7 @@ xfs_iwalk_ichunk_ra(
imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
if (imask & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, agno, agbno,
+ xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
igeo->blocks_per_cluster,
&xfs_inode_buf_ops);
}
@@ -171,26 +179,25 @@ xfs_iwalk_free(
/* For each inuse inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
- struct xfs_iwalk_ag *iwag)
+ struct xfs_iwalk_ag *iwag)
{
- struct xfs_mount *mp = iwag->mp;
- struct xfs_trans *tp = iwag->tp;
- xfs_ino_t ino;
- unsigned int i, j;
- xfs_agnumber_t agno;
- int error;
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
+ xfs_ino_t ino;
+ unsigned int i, j;
+ int error;
- agno = XFS_INO_TO_AGNO(mp, iwag->startino);
for (i = 0; i < iwag->nr_recs; i++) {
struct xfs_inobt_rec_incore *irec = &iwag->recs[i];
- trace_xfs_iwalk_ag_rec(mp, agno, irec);
+ trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);
if (xfs_pwork_want_abort(&iwag->pwork))
return 0;
if (iwag->inobt_walk_fn) {
- error = iwag->inobt_walk_fn(mp, tp, agno, irec,
+ error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
iwag->data);
if (error)
return error;
@@ -208,7 +215,8 @@ xfs_iwalk_ag_recs(
continue;
/* Otherwise call our function. */
- ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ irec->ir_startino + j);
error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
if (error)
return error;
@@ -254,7 +262,6 @@ xfs_iwalk_del_inobt(
STATIC int
xfs_iwalk_ag_start(
struct xfs_iwalk_ag *iwag,
- xfs_agnumber_t agno,
xfs_agino_t agino,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp,
@@ -262,12 +269,13 @@ xfs_iwalk_ag_start(
{
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
struct xfs_inobt_rec_incore *irec;
int error;
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp);
if (error)
return error;
@@ -301,6 +309,9 @@ xfs_iwalk_ag_start(
if (XFS_IS_CORRUPT(mp, *has_more != 1))
return -EFSCORRUPTED;
+ iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
+
/*
* If the LE lookup yielded an inobt record before the cursor position,
* skip it and see if there's another one after it.
@@ -339,23 +350,28 @@ out_advance:
STATIC int
xfs_iwalk_run_callbacks(
struct xfs_iwalk_ag *iwag,
- xfs_agnumber_t agno,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp,
int *has_more)
{
struct xfs_mount *mp = iwag->mp;
- struct xfs_trans *tp = iwag->tp;
struct xfs_inobt_rec_incore *irec;
- xfs_agino_t restart;
+ xfs_agino_t next_agino;
int error;
+ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
+
ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */
- xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
irec = &iwag->recs[iwag->nr_recs - 1];
- restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+ ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
+
+ if (iwag->drop_trans) {
+ xfs_trans_cancel(iwag->tp);
+ iwag->tp = NULL;
+ }
error = xfs_iwalk_ag_recs(iwag);
if (error)
@@ -367,12 +383,19 @@ xfs_iwalk_run_callbacks(
if (!has_more)
return 0;
+ if (iwag->drop_trans) {
+ error = xfs_trans_alloc_empty(mp, &iwag->tp);
+ if (error)
+ return error;
+ }
+
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp,
+ agi_bpp);
if (error)
return error;
- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
@@ -381,21 +404,21 @@ xfs_iwalk_ag(
struct xfs_iwalk_ag *iwag)
{
struct xfs_mount *mp = iwag->mp;
- struct xfs_trans *tp = iwag->tp;
+ struct xfs_perag *pag = iwag->pag;
struct xfs_buf *agi_bp = NULL;
struct xfs_btree_cur *cur = NULL;
- xfs_agnumber_t agno;
xfs_agino_t agino;
int has_more;
int error = 0;
/* Set up our cursor at the right place in the inode btree. */
- agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+ ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
agino = XFS_INO_TO_AGINO(mp, iwag->startino);
- error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+ error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);
while (!error && has_more) {
struct xfs_inobt_rec_incore *irec;
+ xfs_ino_t rec_fsino;
cond_resched();
if (xfs_pwork_want_abort(&iwag->pwork))
@@ -407,6 +430,15 @@ xfs_iwalk_ag(
if (error || !has_more)
break;
+ /* Make sure that we always move forward. */
+ rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
+ if (iwag->lastino != NULLFSINO &&
+ XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
+
/* No allocated inodes in this chunk; skip it. */
if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
error = xfs_btree_increment(cur, 0, &has_more);
@@ -420,7 +452,7 @@ xfs_iwalk_ag(
* walking the inodes.
*/
if (iwag->iwalk_fn)
- xfs_iwalk_ichunk_ra(mp, agno, irec);
+ xfs_iwalk_ichunk_ra(mp, pag, irec);
/*
* If there's space in the buffer for more records, increment
@@ -440,18 +472,17 @@ xfs_iwalk_ag(
* we would be if we had been able to increment like above.
*/
ASSERT(has_more);
- error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
- &has_more);
+ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
}
if (iwag->nr_recs == 0 || error)
goto out;
/* Walk the unprocessed records in the cache. */
- error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
+ error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
out:
- xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+ xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
return error;
}
@@ -535,7 +566,9 @@ xfs_iwalk(
.trim_start = 1,
.skip_empty = 1,
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
+ struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -546,15 +579,19 @@ xfs_iwalk(
if (error)
return error;
- for (; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
+ iwag.pag = pag;
error = xfs_iwalk_ag(&iwag);
if (error)
break;
iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
break;
+ iwag.pag = NULL;
}
+ if (iwag.pag)
+ xfs_perag_put(pag);
xfs_iwalk_free(&iwag);
return error;
}
@@ -575,10 +612,21 @@ xfs_iwalk_ag_work(
error = xfs_iwalk_alloc(iwag);
if (error)
goto out;
+ /*
+ * Grab an empty transaction so that we can use its recursive buffer
+ * locking abilities to detect cycles in the inobt without deadlocking.
+ */
+ error = xfs_trans_alloc_empty(mp, &iwag->tp);
+ if (error)
+ goto out;
+ iwag->drop_trans = 1;
error = xfs_iwalk_ag(iwag);
+ if (iwag->tp)
+ xfs_trans_cancel(iwag->tp);
xfs_iwalk_free(iwag);
out:
+ xfs_perag_put(iwag->pag);
kmem_free(iwag);
return error;
}
@@ -598,20 +646,18 @@ xfs_iwalk_threaded(
void *data)
{
struct xfs_pwork_ctl pctl;
+ struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
- unsigned int nr_threads;
int error;
ASSERT(agno < mp->m_sb.sb_agcount);
ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
- nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
- error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
- nr_threads);
+ error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
if (error)
return error;
- for (; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
struct xfs_iwalk_ag *iwag;
if (xfs_pwork_ctl_want_abort(&pctl))
@@ -619,16 +665,25 @@ xfs_iwalk_threaded(
iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
iwag->mp = mp;
+
+ /*
+ * perag is being handed off to async work, so take another
+ * reference for the async work to release.
+ */
+ atomic_inc(&pag->pag_ref);
+ iwag->pag = pag;
iwag->iwalk_fn = iwalk_fn;
iwag->data = data;
iwag->startino = startino;
iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ iwag->lastino = NULLFSINO;
xfs_pwork_queue(&pctl, &iwag->pwork);
- startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
break;
}
-
+ if (pag)
+ xfs_perag_put(pag);
if (polled)
xfs_pwork_poll(&pctl);
return xfs_pwork_destroy(&pctl);
@@ -696,7 +751,9 @@ xfs_inobt_walk(
.startino = startino,
.sz_recs = xfs_inobt_walk_prefetch(inobt_records),
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
+ struct xfs_perag *pag;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -707,15 +764,19 @@ xfs_inobt_walk(
if (error)
return error;
- for (; agno < mp->m_sb.sb_agcount; agno++) {
+ for_each_perag_from(mp, agno, pag) {
+ iwag.pag = pag;
error = xfs_iwalk_ag(&iwag);
if (error)
break;
- iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
break;
+ iwag.pag = NULL;
}
+ if (iwag.pag)
+ xfs_perag_put(pag);
xfs_iwalk_free(&iwag);
return error;
}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 37a795f03267..83699089755e 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int inode_records, bool poll, void *data);
/* Only iterate inodes within the same AG as @startino. */
-#define XFS_IWALK_SAME_AG (0x1)
+#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 8738bb03f253..f9878021e7d0 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -60,6 +60,8 @@ typedef __u32 xfs_nlink_t;
#include <linux/list_sort.h>
#include <linux/ratelimit.h>
#include <linux/rhashtable.h>
+#include <linux/xattr.h>
+#include <linux/mnt_idmapping.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -97,16 +99,11 @@ typedef __u32 xfs_nlink_t;
#define xfs_rotorstep xfs_params.rotorstep.val
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
-#define xfs_eofb_secs xfs_params.eofb_timer.val
-#define xfs_cowb_secs xfs_params.cowb_timer.val
+#define xfs_blockgc_secs xfs_params.blockgc_timer.val
#define current_cpu() (raw_smp_processor_id())
-#define current_pid() (current->pid)
-#define current_test_flags(f) (current->flags & (f))
#define current_set_flags_nested(sp, f) \
(*(sp) = current->flags, current->flags |= (f))
-#define current_clear_flags_nested(sp, f) \
- (*(sp) = current->flags, current->flags &= ~(f))
#define current_restore_flags_nested(sp, f) \
(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
@@ -126,7 +123,6 @@ typedef __u32 xfs_nlink_t;
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
/*
@@ -163,32 +159,6 @@ struct xstats {
extern struct xstats xfsstats;
-/* Kernel uid/gid conversion. These are used to convert to/from the on disk
- * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
- * The conversion here is type only, the value will remain the same since we
- * are converting to the init_user_ns. The uid is later mapped to a particular
- * user namespace value when crossing the kernel/user boundary.
- */
-static inline uint32_t xfs_kuid_to_uid(kuid_t uid)
-{
- return from_kuid(&init_user_ns, uid);
-}
-
-static inline kuid_t xfs_uid_to_kuid(uint32_t uid)
-{
- return make_kuid(&init_user_ns, uid);
-}
-
-static inline uint32_t xfs_kgid_to_gid(kgid_t gid)
-{
- return from_kgid(&init_user_ns, gid);
-}
-
-static inline kgid_t xfs_gid_to_kgid(uint32_t gid)
-{
- return make_kgid(&init_user_ns, gid);
-}
-
static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
{
return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
@@ -205,6 +175,12 @@ static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev)
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
+static inline uint64_t rounddown_64(uint64_t x, uint32_t y)
+{
+ do_div(x, y);
+ return x * y;
+}
+
static inline uint64_t roundup_64(uint64_t x, uint32_t y)
{
x += y - 1;
@@ -220,7 +196,7 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
}
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
- char *data, unsigned int op);
+ char *data, enum req_op op);
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
@@ -258,7 +234,7 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
* configured realtime device.
*/
#define XFS_IS_REALTIME_INODE(ip) \
- (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \
+ (((ip)->i_diflags & XFS_DIFLAG_REALTIME) && \
(ip)->i_mount->m_rtdev_targp)
#define XFS_IS_REALTIME_MOUNT(mp) ((mp)->m_rtdev_targp ? 1 : 0)
#else
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f6006d94a581..f02a0dd522b3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -21,16 +21,9 @@
#include "xfs_sb.h"
#include "xfs_health.h"
-kmem_zone_t *xfs_log_ticket_zone;
+struct kmem_cache *xfs_log_ticket_cache;
/* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp);
-
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
@@ -47,48 +40,27 @@ xlog_dealloc_log(
/* local state machine functions */
STATIC void xlog_state_done_syncing(
- struct xlog_in_core *iclog,
- bool aborted);
+ struct xlog_in_core *iclog);
+STATIC void xlog_state_do_callback(
+ struct xlog *log);
STATIC int
xlog_state_get_iclog_space(
struct xlog *log,
int len,
struct xlog_in_core **iclog,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp);
STATIC void
-xlog_state_switch_iclogs(
- struct xlog *log,
- struct xlog_in_core *iclog,
- int eventual_size);
-STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
-
-STATIC void
xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
-xlog_regrant_reserve_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
-xlog_ungrant_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog);
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
#if defined(DEBUG)
STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr);
-STATIC void
xlog_verify_grant_tail(
struct xlog *log);
STATIC void
@@ -99,19 +71,76 @@ xlog_verify_iclog(
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t tail_lsn);
+ struct xlog_in_core *iclog);
#else
-#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
-#define xlog_verify_tail_lsn(a,b,c)
+#define xlog_verify_tail_lsn(a,b)
#endif
STATIC int
xlog_iclogs_empty(
struct xlog *log);
+static int
+xfs_log_cover(struct xfs_mount *);
+
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the call is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -197,12 +226,12 @@ xlog_ticket_reservation(
if (head == &log->l_write_head) {
ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
return tic->t_unit_res;
- } else {
- if (tic->t_flags & XLOG_TIC_PERM_RESERV)
- return tic->t_unit_res * tic->t_cnt;
- else
- return tic->t_unit_res;
}
+
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ return tic->t_unit_res * tic->t_cnt;
+
+ return tic->t_unit_res;
}
STATIC bool
@@ -265,7 +294,7 @@ xlog_grant_head_wait(
list_add_tail(&tic->t_queue, &head->waiters);
do {
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
goto shutdown;
xlog_grant_push_ail(log, need_bytes);
@@ -279,7 +308,7 @@ xlog_grant_head_wait(
trace_xfs_log_grant_wake(log, tic);
spin_lock(&head->lock);
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
goto shutdown;
} while (xlog_space_left(log, &head->grant) < need_bytes);
@@ -317,7 +346,7 @@ xlog_grant_head_check(
int free_bytes;
int error = 0;
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+ ASSERT(!xlog_in_recovery(log));
/*
* If there are other waiters on the queue then give them a chance at
@@ -344,28 +373,25 @@ xlog_grant_head_check(
return error;
}
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
{
- if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
- /* add to overflow and start again */
- tic->t_res_o_flow += tic->t_res_arr_sum;
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- }
-
- tic->t_res_arr[tic->t_res_num].r_len = len;
- tic->t_res_arr[tic->t_res_num].r_type = type;
- tic->t_res_arr_sum += len;
- tic->t_res_num++;
+ /*
+ * Do not write to the log on norecovery mounts, if the data or log
+ * devices are read-only, or if the filesystem is shutdown. Read-only
+ * mounts allow internal writes for log recovery and unmount purposes,
+ * so don't restrict that case.
+ */
+ if (xfs_has_norecovery(mp))
+ return false;
+ if (xfs_readonly_buftarg(mp->m_ddev_targp))
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (xlog_is_shutdown(mp->m_log))
+ return false;
+ return true;
}
/*
@@ -380,7 +406,7 @@ xfs_log_regrant(
int need_bytes;
int error = 0;
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
@@ -396,8 +422,6 @@ xfs_log_regrant(
xlog_grant_push_ail(log, tic->t_unit_res);
tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
if (tic->t_cnt > 0)
return 0;
@@ -435,10 +459,9 @@ out_error:
int
xfs_log_reserve(
struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
+ int unit_bytes,
+ int cnt,
struct xlog_ticket **ticp,
- uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -446,15 +469,13 @@ xfs_log_reserve(
int need_bytes;
int error = 0;
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -484,138 +505,119 @@ out_error:
return error;
}
-
/*
- * NOTES:
+ * Run all the pending iclog callbacks and wake log force waiters and iclog
+ * space waiters so they can process the newly set shutdown state. We really
+ * don't care what order we process callbacks here because the log is shut down
+ * and so state cannot change on disk anymore. However, we cannot wake waiters
+ * until the callbacks have been processed because we may be in unmount and
+ * we must ensure that all AIL operations the callbacks perform have completed
+ * before we tear down the AIL.
*
- * 1. currblock field gets updated at startup and after in-core logs
- * marked as with WANT_SYNC.
+ * We avoid processing actively referenced iclogs so that we don't run callbacks
+ * while the iclog owner might still be preparing the iclog for IO submssion.
+ * These will be caught by xlog_state_iclog_release() and call this function
+ * again to process any callbacks that may have been added to that iclog.
*/
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation. If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket. If the ticket was one with a permanent reservation, then
- * a few operations are done differently. Permanent reservation tickets by
- * default don't release the reservation. They just commit the current
- * transaction with the belief that the reservation is still needed. A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again. By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
- struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant)
+static void
+xlog_state_shutdown_callbacks(
+ struct xlog *log)
{
- struct xlog *log = mp->m_log;
- xfs_lsn_t lsn = 0;
-
- if (XLOG_FORCED_SHUTDOWN(log) ||
- /*
- * If nothing was ever written, don't write out commit record.
- * If we get an error, just continue and give back the log ticket.
- */
- (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(log, ticket, iclog, &lsn)))) {
- lsn = (xfs_lsn_t) -1;
- regrant = false;
- }
-
-
- if (!regrant) {
- trace_xfs_log_done_nonperm(log, ticket);
-
- /*
- * Release ticket if not permanent reservation or a specific
- * request has been made to release a permanent reservation.
- */
- xlog_ungrant_log_space(log, ticket);
- } else {
- trace_xfs_log_done_perm(log, ticket);
-
- xlog_regrant_reserve_log_space(log, ticket);
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- ticket->t_flags |= XLOG_TIC_INITED;
- }
-
- xfs_log_ticket_put(ticket);
- return lsn;
-}
+ struct xlog_in_core *iclog;
+ LIST_HEAD(cb_list);
-static bool
-__xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- lockdep_assert_held(&log->l_icloglock);
+ iclog = log->l_iclog;
+ do {
+ if (atomic_read(&iclog->ic_refcnt)) {
+ /* Reference holder will re-run iclog callbacks. */
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
- /* update tail before writing to iclog */
- xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+ xlog_cil_process_committed(&cb_list);
- iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
- xlog_verify_tail_lsn(log, iclog, tail_lsn);
- /* cycle incremented when incrementing curr_block */
- return true;
- }
+ spin_lock(&log->l_icloglock);
+ wake_up_all(&iclog->ic_write_wait);
+ wake_up_all(&iclog->ic_force_wait);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
- return false;
+ wake_up_all(&log->l_flush_wait);
}
/*
* Flush iclog to disk if this is the last reference to the given iclog and the
* it is in the WANT_SYNC state.
+ *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+ * within the iclog. We need to ensure that the log tail does not move beyond
+ * the tail that the first commit record in the iclog ordered against, otherwise
+ * correct recovery of that checkpoint becomes dependent on future operations
+ * performed on this iclog.
+ *
+ * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
+ * current tail into iclog. Once the iclog tail is set, future operations must
+ * not modify it, otherwise they potentially violate ordering constraints for
+ * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
+ * the iclog will get zeroed on activation of the iclog after sync, so we
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
*/
-static int
+int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
- lockdep_assert_held(&log->l_icloglock);
+ xfs_lsn_t tail_lsn;
+ bool last_ref;
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- return -EIO;
+ lockdep_assert_held(&log->l_icloglock);
- if (atomic_dec_and_test(&iclog->ic_refcnt) &&
- __xlog_state_release_iclog(log, iclog)) {
- spin_unlock(&log->l_icloglock);
- xlog_sync(log, iclog);
- spin_lock(&log->l_icloglock);
+ trace_xlog_iclog_release(iclog, _RET_IP_);
+ /*
+ * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ * of the tail LSN into the iclog so we guarantee that the log tail does
+ * not move between the first time we know that the iclog needs to be
+ * made stable and when we eventually submit it.
+ */
+ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
+ !iclog->ic_header.h_tail_lsn) {
+ tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
}
- return 0;
-}
+ last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
-int
-xfs_log_release_iclog(
- struct xfs_mount *mp,
- struct xlog_in_core *iclog)
-{
- struct xlog *log = mp->m_log;
- bool sync;
-
- if (iclog->ic_state == XLOG_STATE_IOERROR) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ if (xlog_is_shutdown(log)) {
+ /*
+ * If there are no more references to this iclog, process the
+ * pending iclog callbacks that were waiting on the release of
+ * this iclog.
+ */
+ if (last_ref)
+ xlog_state_shutdown_callbacks(log);
return -EIO;
}
- if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
- sync = __xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- if (sync)
- xlog_sync(log, iclog);
+ if (!last_ref)
+ return 0;
+
+ if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return 0;
}
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ xlog_verify_tail_lsn(log, iclog);
+ trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog, ticket);
+ spin_lock(&log->l_icloglock);
return 0;
}
@@ -636,25 +638,27 @@ xfs_log_mount(
xfs_daddr_t blk_offset,
int num_bblks)
{
- bool fatal = xfs_sb_version_hascrc(&mp->m_sb);
+ struct xlog *log;
+ bool fatal = xfs_has_crc(mp);
int error = 0;
int min_logfsbs;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ if (!xfs_has_norecovery(mp)) {
xfs_notice(mp, "Mounting V%d Filesystem",
XFS_SB_VERSION_NUM(&mp->m_sb));
} else {
xfs_notice(mp,
"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
XFS_SB_VERSION_NUM(&mp->m_sb));
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ ASSERT(xfs_is_readonly(mp));
}
- mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
- if (IS_ERR(mp->m_log)) {
- error = PTR_ERR(mp->m_log);
+ log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+ if (IS_ERR(log)) {
+ error = PTR_ERR(log);
goto out;
}
+ mp->m_log = log;
/*
* Validate the given log space and drop a critical message via syslog
@@ -719,51 +723,51 @@ xfs_log_mount(
xfs_warn(mp, "AIL initialisation failed: error %d", error);
goto out_free_log;
}
- mp->m_log->l_ailp = mp->m_ail;
+ log->l_ailp = mp->m_ail;
/*
* skip log recovery on a norecovery mount. pretend it all
* just worked.
*/
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
- int readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
- if (readonly)
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
-
- error = xlog_recover(mp->m_log);
-
+ if (!xfs_has_norecovery(mp)) {
+ /*
+ * log recovery ignores readonly state and so we need to clear
+ * mount-based read only state so it can write to disk.
+ */
+ bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
+ &mp->m_opstate);
+ error = xlog_recover(log);
if (readonly)
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
- xlog_recover_cancel(mp->m_log);
+ xlog_recover_cancel(log);
goto out_destroy_ail;
}
}
- error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+ error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
"log");
if (error)
goto out_destroy_ail;
/* Normal transactions can now occur */
- mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
/*
* Now the log has been fully initialised and we know were our
* space grant counters are, we can initialise the permanent ticket
* needed for delayed logging to work.
*/
- xlog_cil_init_post_recovery(mp->m_log);
+ xlog_cil_init_post_recovery(log);
return 0;
out_destroy_ail:
xfs_trans_ail_destroy(mp);
out_free_log:
- xlog_dealloc_log(mp->m_log);
+ xlog_dealloc_log(log);
out:
return error;
}
@@ -782,19 +786,22 @@ int
xfs_log_mount_finish(
struct xfs_mount *mp)
{
- int error = 0;
- bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
- bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED;
+ struct xlog *log = mp->m_log;
+ bool readonly;
+ int error = 0;
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (xfs_has_norecovery(mp)) {
+ ASSERT(xfs_is_readonly(mp));
return 0;
- } else if (readonly) {
- /* Allow unlinked processing to proceed */
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
}
/*
+ * log recovery ignores readonly state and so we need to clear
+ * mount-based read only state so it can write to disk.
+ */
+ readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
@@ -815,9 +822,9 @@ xfs_log_mount_finish(
* mount failure occurs.
*/
mp->m_super->s_flags |= SB_ACTIVE;
- error = xlog_recover_finish(mp->m_log);
- if (!error)
- xfs_log_work_queue(mp);
+ xfs_log_work_queue(mp);
+ if (xlog_recovery_needed(log))
+ error = xlog_recover_finish(log);
mp->m_super->s_flags &= ~SB_ACTIVE;
evict_inodes(mp->m_super);
@@ -830,14 +837,24 @@ xfs_log_mount_finish(
* Don't push in the error case because the AIL may have pending intents
* that aren't removed until recovery is cancelled.
*/
- if (!error && recovered) {
- xfs_log_force(mp, XFS_LOG_SYNC);
- xfs_ail_push_all_sync(mp->m_ail);
+ if (xlog_recovery_needed(log)) {
+ if (!error) {
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_ail_push_all_sync(mp->m_ail);
+ }
+ xfs_notice(mp, "Ending recovery (logdev: %s)",
+ mp->m_logname ? mp->m_logname : "internal");
+ } else {
+ xfs_info(mp, "Ending clean mount");
}
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
+ clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
if (readonly)
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /* Make sure the log is dead if we're returning failure. */
+ ASSERT(!error || xlog_is_shutdown(log));
return error;
}
@@ -855,62 +872,114 @@ xfs_log_mount_cancel(
}
/*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens. Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Flush out the iclog to disk ensuring that device caches are flushed and
+ * the iclog hits stable storage before any completion waiters are woken.
*/
+static inline int
+xlog_force_iclog(
+ struct xlog_in_core *iclog)
+{
+ atomic_inc(&iclog->ic_refcnt);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+ return xlog_state_release_iclog(iclog->ic_log, iclog, NULL);
+}
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
- struct xfs_mount *mp)
+/*
+ * Wait for the iclog and all prior iclogs to be written disk as required by the
+ * log force state machine. Waiting on ic_force_wait ensures iclog completions
+ * have been ordered and callbacks run before we are woken here, hence
+ * guaranteeing that all the iclogs up to this one are on stable storage.
+ */
+int
+xlog_wait_on_iclog(
+ struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock)
{
- /* the data section must be 32 bit size aligned */
- struct xfs_unmount_log_format magic = {
- .magic = XLOG_UNMOUNT_TYPE,
+ struct xlog *log = iclog->ic_log;
+
+ trace_xlog_iclog_wait_on(iclog, _RET_IP_);
+ if (!xlog_is_shutdown(log) &&
+ iclog->ic_state != XLOG_STATE_ACTIVE &&
+ iclog->ic_state != XLOG_STATE_DIRTY) {
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
+ */
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket)
+{
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
};
struct xfs_log_iovec reg = {
- .i_addr = &magic,
- .i_len = sizeof(magic),
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
- struct xlog *log = mp->m_log;
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
+
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(unmount_rec);
+
+ return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
- xfs_lsn_t lsn;
- uint flags = XLOG_UNMOUNT_TRANS;
int error;
- error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
- /* remove inited flag, and account for space used */
- tic->t_flags = 0;
- tic->t_curr_res -= sizeof(magic);
- error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+ error = xlog_write_unmount_record(log, tic);
/*
* At this point, we're umounting anyway, so there's no point in
- * transitioning log state to IOERROR. Just continue...
+ * transitioning log state to shutdown. Just continue...
*/
out_err:
if (error)
@@ -918,29 +987,27 @@ out_err:
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_want_sync(log, iclog);
- error = xlog_state_release_iclog(log, iclog);
- switch (iclog->ic_state) {
- default:
- if (!XLOG_FORCED_SHUTDOWN(log)) {
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- break;
- }
- /* fall through */
- case XLOG_STATE_ACTIVE:
- case XLOG_STATE_DIRTY:
- spin_unlock(&log->l_icloglock);
- break;
- }
+ error = xlog_force_iclog(iclog);
+ xlog_wait_on_iclog(iclog);
if (tic) {
trace_xfs_log_umount_write(log, tic);
- xlog_ungrant_log_space(log, tic);
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
}
}
+static void
+xfs_log_unmount_verify_iclog(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ ASSERT(iclog->ic_offset == 0);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
/*
* Unmount record used to have a string "Unmount filesystem--" in the
* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
@@ -948,75 +1015,36 @@ out_err:
* currently architecture converted and "Unmount" is a bit foo.
* As far as I know, there weren't any dependencies on the old behaviour.
*/
-
-static int
-xfs_log_unmount_write(xfs_mount_t *mp)
+static void
+xfs_log_unmount_write(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- xlog_in_core_t *iclog;
-#ifdef DEBUG
- xlog_in_core_t *first_iclog;
-#endif
- int error;
+ struct xlog *log = mp->m_log;
- /*
- * Don't write out unmount record on norecovery mounts or ro devices.
- * Or, if we are doing a forced umount (typically because of IO errors).
- */
- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_targ)) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
- return 0;
- }
+ if (!xfs_log_writable(mp))
+ return;
- error = xfs_log_force(mp, XFS_LOG_SYNC);
- ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+ xfs_log_force(mp, XFS_LOG_SYNC);
-#ifdef DEBUG
- first_iclog = iclog = log->l_iclog;
- do {
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
- ASSERT(iclog->ic_offset == 0);
- }
- iclog = iclog->ic_next;
- } while (iclog != first_iclog);
-#endif
- if (! (XLOG_FORCED_SHUTDOWN(log))) {
- xfs_log_write_unmount_record(mp);
- } else {
- /*
- * We're already in forced_shutdown mode, couldn't
- * even attempt to write out the unmount transaction.
- *
- * Go through the motions of sync'ing and releasing
- * the iclog, even though no I/O will actually happen,
- * we need to wait for other log I/Os that may already
- * be in progress. Do this as a separate section of
- * code so we'll know if we ever get stuck here that
- * we're in this odd situation of trying to unmount
- * a file system that went into forced_shutdown as
- * the result of an unmount..
- */
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_want_sync(log, iclog);
- error = xlog_state_release_iclog(log, iclog);
- switch (iclog->ic_state) {
- case XLOG_STATE_ACTIVE:
- case XLOG_STATE_DIRTY:
- case XLOG_STATE_IOERROR:
- spin_unlock(&log->l_icloglock);
- break;
- default:
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- break;
- }
+ if (xlog_is_shutdown(log))
+ return;
+
+ /*
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
}
- return error;
-} /* xfs_log_unmount_write */
+ xfs_log_unmount_verify_iclog(log);
+ xlog_unmount_write(log);
+}
/*
* Empty the log for unmount/freeze.
@@ -1024,28 +1052,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
* To do this, we first need to shut down the background log work so it is not
* trying to cover the log as we clean up. We then need to unpin all objects in
* the log so we can then flush them out. Once they have completed their IO and
- * run the callbacks removing themselves from the AIL, we can write the unmount
- * record.
+ * run the callbacks removing themselves from the AIL, we can cover the log.
*/
-void
+int
xfs_log_quiesce(
struct xfs_mount *mp)
{
+ /*
+ * Clear log incompat features since we're quiescing the log. Report
+ * failures, though it's not fatal to have a higher log feature
+ * protection level than the log contents actually require.
+ */
+ if (xfs_clear_incompat_log_features(mp)) {
+ int error;
+
+ error = xfs_sync_sb(mp, false);
+ if (error)
+ xfs_warn(mp,
+ "Failed to clear log incompat features on quiesce");
+ }
+
cancel_delayed_work_sync(&mp->m_log->l_work);
xfs_log_force(mp, XFS_LOG_SYNC);
/*
* The superblock buffer is uncached and while xfs_ail_push_all_sync()
- * will push it, xfs_wait_buftarg() will not wait for it. Further,
+ * will push it, xfs_buftarg_wait() will not wait for it. Further,
* xfs_buf_iowait() cannot be used because it was pushed with the
* XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
* the IO to complete.
*/
xfs_ail_push_all_sync(mp->m_ail);
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_wait(mp->m_ddev_targp);
xfs_buf_lock(mp->m_sb_bp);
xfs_buf_unlock(mp->m_sb_bp);
+ return xfs_log_cover(mp);
+}
+
+void
+xfs_log_clean(
+ struct xfs_mount *mp)
+{
+ xfs_log_quiesce(mp);
xfs_log_unmount_write(mp);
}
@@ -1060,7 +1109,9 @@ void
xfs_log_unmount(
struct xfs_mount *mp)
{
- xfs_log_quiesce(mp);
+ xfs_log_clean(mp);
+
+ xfs_buftarg_drain(mp->m_ddev_targp);
xfs_trans_ail_destroy(mp);
@@ -1076,7 +1127,7 @@ xfs_log_item_init(
int type,
const struct xfs_item_ops *ops)
{
- item->li_mountp = mp;
+ item->li_log = mp->m_log;
item->li_ailp = mp->m_ail;
item->li_type = type;
item->li_ops = ops;
@@ -1098,11 +1149,11 @@ xfs_log_space_wake(
struct xlog *log = mp->m_log;
int free_bytes;
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
return;
if (!list_empty_careful(&log->l_write_head.waiters)) {
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+ ASSERT(!xlog_in_recovery(log));
spin_lock(&log->l_write_head.lock);
free_bytes = xlog_space_left(log, &log->l_write_head.grant);
@@ -1111,7 +1162,7 @@ xfs_log_space_wake(
}
if (!list_empty_careful(&log->l_reserve_head.waiters)) {
- ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+ ASSERT(!xlog_in_recovery(log));
spin_lock(&log->l_reserve_head.lock);
free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
@@ -1137,17 +1188,15 @@ xfs_log_space_wake(
* there's no point in running a dummy transaction at this point because we
* can't start trying to idle the log until both the CIL and AIL are empty.
*/
-static int
-xfs_log_need_covered(xfs_mount_t *mp)
+static bool
+xfs_log_need_covered(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- int needed = 0;
-
- if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
- return 0;
+ struct xlog *log = mp->m_log;
+ bool needed = false;
if (!xlog_cil_empty(log))
- return 0;
+ return false;
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
@@ -1162,14 +1211,14 @@ xfs_log_need_covered(xfs_mount_t *mp)
if (!xlog_iclogs_empty(log))
break;
- needed = 1;
+ needed = true;
if (log->l_covered_state == XLOG_STATE_COVER_NEED)
log->l_covered_state = XLOG_STATE_COVER_DONE;
else
log->l_covered_state = XLOG_STATE_COVER_DONE2;
break;
default:
- needed = 1;
+ needed = true;
break;
}
spin_unlock(&log->l_icloglock);
@@ -1177,6 +1226,60 @@ xfs_log_need_covered(xfs_mount_t *mp)
}
/*
+ * Explicitly cover the log. This is similar to background log covering but
+ * intended for usage in quiesce codepaths. The caller is responsible to ensure
+ * the log is idle and suitable for covering. The CIL, iclog buffers and AIL
+ * must all be empty.
+ */
+static int
+xfs_log_cover(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+ bool need_covered;
+
+ ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
+ !xfs_ail_min_lsn(mp->m_log->l_ailp)) ||
+ xlog_is_shutdown(mp->m_log));
+
+ if (!xfs_log_writable(mp))
+ return 0;
+
+ /*
+ * xfs_log_need_covered() is not idempotent because it progresses the
+ * state machine if the log requires covering. Therefore, we must call
+ * this function once and use the result until we've issued an sb sync.
+ * Do so first to make that abundantly clear.
+ *
+ * Fall into the covering sequence if the log needs covering or the
+ * mount has lazy superblock accounting to sync to disk. The sb sync
+ * used for covering accumulates the in-core counters, so covering
+ * handles this for us.
+ */
+ need_covered = xfs_log_need_covered(mp);
+ if (!need_covered && !xfs_has_lazysbcount(mp))
+ return 0;
+
+ /*
+ * To cover the log, commit the superblock twice (at most) in
+ * independent checkpoints. The first serves as a reference for the
+ * tail pointer. The sync transaction and AIL push empties the AIL and
+ * updates the in-core tail to the LSN of the first checkpoint. The
+ * second commit updates the on-disk tail with the in-core LSN,
+ * covering the log. Push the AIL one more time to leave it empty, as
+ * we found it.
+ */
+ do {
+ error = xfs_sync_sb(mp, true);
+ if (error)
+ break;
+ xfs_ail_push_all_sync(mp->m_ail);
+ } while (xfs_log_need_covered(mp));
+
+ return error;
+}
+
+/*
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
@@ -1227,16 +1330,18 @@ xlog_assign_tail_lsn(
* wrap the tail, we should blow up. Rather than catch this case here,
* we depend on other ASSERTions in other parts of the code. XXXmiken
*
- * This code also handles the case where the reservation head is behind
- * the tail. The details of this case are described below, but the end
- * result is that we return the size of the log as the amount of space left.
+ * If reservation head is behind the tail, we have a problem. Warn about it,
+ * but then treat it as if the log is empty.
+ *
+ * If the log is shut down, the head and tail may be invalid or out of whack, so
+ * shortcut invalidity asserts in this case so that we don't trigger them
+ * falsely.
*/
STATIC int
xlog_space_left(
struct xlog *log,
atomic64_t *head)
{
- int free_bytes;
int tail_bytes;
int tail_cycle;
int head_cycle;
@@ -1246,29 +1351,30 @@ xlog_space_left(
xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
tail_bytes = BBTOB(tail_bytes);
if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
- free_bytes = log->l_logsize - (head_bytes - tail_bytes);
- else if (tail_cycle + 1 < head_cycle)
+ return log->l_logsize - (head_bytes - tail_bytes);
+ if (tail_cycle + 1 < head_cycle)
return 0;
- else if (tail_cycle < head_cycle) {
+
+ /* Ignore potential inconsistency when shutdown. */
+ if (xlog_is_shutdown(log))
+ return log->l_logsize;
+
+ if (tail_cycle < head_cycle) {
ASSERT(tail_cycle == (head_cycle - 1));
- free_bytes = tail_bytes - head_bytes;
- } else {
- /*
- * The reservation head is behind the tail.
- * In this case we just want to return the size of the
- * log as the amount of space left.
- */
- xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
- xfs_alert(log->l_mp,
- " tail_cycle = %d, tail_bytes = %d",
- tail_cycle, tail_bytes);
- xfs_alert(log->l_mp,
- " GH cycle = %d, GH bytes = %d",
- head_cycle, head_bytes);
- ASSERT(0);
- free_bytes = log->l_logsize;
+ return tail_bytes - head_bytes;
}
- return free_bytes;
+
+ /*
+ * The reservation head is behind the tail. In this case we just want to
+ * return the size of the log as the amount of space left.
+ */
+ xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
+ xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d",
+ tail_cycle, tail_bytes);
+ xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d",
+ head_cycle, head_bytes);
+ ASSERT(0);
+ return log->l_logsize;
}
@@ -1279,7 +1385,6 @@ xlog_ioend_work(
struct xlog_in_core *iclog =
container_of(work, struct xlog_in_core, ic_end_io_work);
struct xlog *log = iclog->ic_log;
- bool aborted = false;
int error;
error = blk_status_to_errno(iclog->ic_bio.bi_status);
@@ -1294,18 +1399,10 @@ xlog_ioend_work(
*/
if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
- /*
- * This flag will be propagated to the trans-committed
- * callback routines to let them know that the log-commit
- * didn't succeed.
- */
- aborted = true;
- } else if (iclog->ic_state == XLOG_STATE_IOERROR) {
- aborted = true;
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
- xlog_state_done_syncing(iclog, aborted);
+ xlog_state_done_syncing(iclog);
bio_uninit(&iclog->ic_bio);
/*
@@ -1355,6 +1452,32 @@ xfs_log_work_queue(
}
/*
+ * Clear the log incompat flags if we have the opportunity.
+ *
+ * This only happens if we're about to log the second dummy transaction as part
+ * of covering the log and we can get the log incompat feature usage lock.
+ */
+static inline void
+xlog_clear_incompat(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
+
+ if (!xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL))
+ return;
+
+ if (log->l_covered_state != XLOG_STATE_COVER_DONE2)
+ return;
+
+ if (!down_write_trylock(&log->l_incompat_users))
+ return;
+
+ xfs_clear_incompat_log_features(mp);
+ up_write(&log->l_incompat_users);
+}
+
+/*
* Every sync period we need to unpin all items in the AIL and push them to
* disk. If there is nothing dirty, then we might need to cover the log to
* indicate that the filesystem is idle.
@@ -1368,7 +1491,7 @@ xfs_log_worker(
struct xfs_mount *mp = log->l_mp;
/* dgc: errors ignored - not fatal and nowhere to report them */
- if (xfs_log_need_covered(mp)) {
+ if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) {
/*
* Dump a transaction into the log that contains no real change.
* This is needed to stamp the current tail LSN into the log
@@ -1380,6 +1503,7 @@ xfs_log_worker(
* synchronously log the superblock instead to ensure the
* superblock is immediately unpinned and can be written back.
*/
+ xlog_clear_incompat(log);
xfs_sync_sb(mp, true);
} else
xfs_log_force(mp, 0);
@@ -1423,7 +1547,7 @@ xlog_alloc_log(
log->l_logBBstart = blk_offset;
log->l_logBBsize = num_bblks;
log->l_covered_state = XLOG_STATE_COVER_IDLE;
- log->l_flags |= XLOG_ACTIVE_RECOVERY;
+ set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
log->l_prev_block = -1;
@@ -1432,11 +1556,16 @@ xlog_alloc_log(
xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
+ if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
+ log->l_iclog_roundoff = mp->m_sb.sb_logsunit;
+ else
+ log->l_iclog_roundoff = BBSIZE;
+
xlog_grant_head_init(&log->l_reserve_head);
xlog_grant_head_init(&log->l_write_head);
error = -EFSCORRUPTED;
- if (xfs_sb_version_hassector(&mp->m_sb)) {
+ if (xfs_has_sector(mp)) {
log2_size = mp->m_sb.sb_logsectlog;
if (log2_size < BBSHIFT) {
xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
@@ -1453,7 +1582,7 @@ xlog_alloc_log(
/* for larger sector sizes, must have v2 or external log */
if (log2_size && log->l_logBBstart > 0 &&
- !xfs_sb_version_haslogv2(&mp->m_sb)) {
+ !xfs_has_logv2(mp)) {
xfs_warn(mp,
"log sector size (0x%x) invalid for configuration.",
log2_size);
@@ -1462,6 +1591,8 @@ xlog_alloc_log(
}
log->l_sectBBsize = 1 << log2_size;
+ init_rwsem(&log->l_incompat_users);
+
xlog_get_iclog_buffer_size(mp, log);
spin_lock_init(&log->l_icloglock);
@@ -1477,7 +1608,6 @@ xlog_alloc_log(
*/
ASSERT(log->l_iclog_size >= 4096);
for (i = 0; i < log->l_iclog_bufs; i++) {
- int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
@@ -1489,18 +1619,15 @@ xlog_alloc_log(
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
- KM_MAYFAIL | KM_ZERO);
+ iclog->ic_data = kvzalloc(log->l_iclog_size,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
-#ifdef DEBUG
- log->l_iclog_bak[i] = &iclog->ic_header;
-#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
head->h_version = cpu_to_be32(
- xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+ xfs_has_logv2(log->l_mp) ? 2 : 1);
head->h_size = cpu_to_be32(log->l_iclog_size);
/* new fields */
head->h_fmt = cpu_to_be32(XLOG_FMT);
@@ -1510,9 +1637,8 @@ xlog_alloc_log(
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
- spin_lock_init(&iclog->ic_callback_lock);
INIT_LIST_HEAD(&iclog->ic_callbacks);
- iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+ iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1525,8 +1651,9 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
- mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
+ WQ_HIGHPRI),
+ 0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -1551,47 +1678,15 @@ out:
return ERR_PTR(error);
} /* xlog_alloc_log */
-
-/*
- * Write out the commit record of a transaction associated with the given
- * ticket. Return the lsn of the commit record.
- */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp)
-{
- struct xfs_mount *mp = log->l_mp;
- int error;
- struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
- .i_type = XLOG_REG_TYPE_COMMIT,
- };
- struct xfs_log_vec vec = {
- .lv_niovecs = 1,
- .lv_iovecp = &reg,
- };
-
- ASSERT_ALWAYS(iclog);
- error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
- XLOG_COMMIT_TRANS);
- if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- return error;
-}
-
/*
- * Push on the buffer cache code if we ever use more than 75% of the on-disk
- * log space. This code pushes on the lsn which would supposedly free up
- * the 25% which we want to leave free. We may need to adopt a policy which
- * pushes on an lsn which is further along in the log once we reach the high
- * water mark. In this manner, we would be creating a low water mark.
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
*/
-STATIC void
-xlog_grant_push_ail(
+xfs_lsn_t
+xlog_grant_push_threshold(
struct xlog *log,
int need_bytes)
{
@@ -1617,7 +1712,7 @@ xlog_grant_push_ail(
free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
free_threshold = max(free_threshold, 256);
if (free_blocks >= free_threshold)
- return;
+ return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
&threshold_block);
@@ -1637,13 +1732,33 @@ xlog_grant_push_ail(
if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
threshold_lsn = last_sync_lsn;
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log))
+ return;
+
/*
* Get the transaction layer to kick the dirty buffers out to
* disk asynchronously. No point in trying to do this if
* the filesystem is shutting down.
*/
- if (!XLOG_FORCED_SHUTDOWN(log))
- xfs_ail_push(log->l_ailp, threshold_lsn);
+ xfs_ail_push(log->l_ailp, threshold_lsn);
}
/*
@@ -1671,7 +1786,7 @@ xlog_pack_data(
dp += BBSIZE;
}
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
xlog_in_core_2_t *xhdr = iclog->ic_data;
for ( ; i < BTOBB(size); i++) {
@@ -1708,14 +1823,12 @@ xlog_cksum(
offsetof(struct xlog_rec_header, h_crc));
/* ... then for additional cycle data for v2 logs ... */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
int i;
int xheads;
- xheads = size / XLOG_HEADER_CYCLE_SIZE;
- if (size % XLOG_HEADER_CYCLE_SIZE)
- xheads++;
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
@@ -1739,7 +1852,7 @@ xlog_bio_end_io(
&iclog->ic_end_io_work);
}
-static void
+static int
xlog_map_iclog_data(
struct bio *bio,
void *data,
@@ -1750,11 +1863,14 @@ xlog_map_iclog_data(
unsigned int off = offset_in_page(data);
size_t len = min_t(size_t, count, PAGE_SIZE - off);
- WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+ if (bio_add_page(bio, page, len, off) != len)
+ return -EIO;
data += len;
count -= len;
} while (count);
+
+ return 0;
}
STATIC void
@@ -1762,10 +1878,10 @@ xlog_write_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
uint64_t bno,
- unsigned int count,
- bool need_flush)
+ unsigned int count)
{
ASSERT(bno < log->l_logBBsize);
+ trace_xlog_iclog_write(iclog, _RET_IP_);
/*
* We lock the iclogbufs here so that we can serialise against I/O
@@ -1776,7 +1892,7 @@ xlog_write_iclog(
* across the log IO to archieve that.
*/
down(&iclog->ic_sema);
- if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
+ if (xlog_is_shutdown(log)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
@@ -1784,21 +1900,52 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog, true);
+ xlog_state_done_syncing(iclog);
up(&iclog->ic_sema);
return;
}
- bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
- bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
+ * IOs coming immediately after this one. This prevents the block layer
+ * writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ howmany(count, PAGE_SIZE),
+ REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
iclog->ic_bio.bi_end_io = xlog_bio_end_io;
iclog->ic_bio.bi_private = iclog;
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
- if (need_flush)
+
+ if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+ /*
+ * For external log devices, we also need to flush the data
+ * device cache first to ensure all metadata writeback covered
+ * by the LSN in this iclog is on stable storage. This is slow,
+ * but it *must* complete before we issue the external log IO.
+ *
+ * If the flush fails, we cannot conclude that past metadata
+ * writeback from the log succeeded. Repeating the flush is
+ * not possible, hence we must shut down with log IO error to
+ * avoid shutdown re-entering this path and erroring out again.
+ */
+ if (log->l_targ != log->l_mp->m_ddev_targp &&
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
+ }
+ if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
+ iclog->ic_bio.bi_opf |= REQ_FUA;
+
+ iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count);
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -1852,34 +1999,20 @@ xlog_calc_iclog_size(
uint32_t *roundoff)
{
uint32_t count_init, count;
- bool use_lsunit;
-
- use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1;
/* Add for LR header */
count_init = log->l_iclog_hsize + iclog->ic_offset;
+ count = roundup(count_init, log->l_iclog_roundoff);
- /* Round out the log write size */
- if (use_lsunit) {
- /* we have a v2 stripe unit to use */
- count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
- } else {
- count = BBTOB(BTOBB(count_init));
- }
-
- ASSERT(count >= count_init);
*roundoff = count - count_init;
- if (use_lsunit)
- ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
- else
- ASSERT(*roundoff < BBTOB(1));
+ ASSERT(count >= count_init);
+ ASSERT(*roundoff < log->l_iclog_roundoff);
return count;
}
/*
- * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
* fashion. Previously, we should have moved the current iclog
* ptr in the log to point to the next available iclog. This allows further
* write to continue while this code syncs out an iclog ready to go.
@@ -1904,28 +2037,37 @@ xlog_calc_iclog_size(
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
unsigned int count; /* byte count of bwrite */
unsigned int roundoff; /* roundoff to BB or stripe */
uint64_t bno;
unsigned int size;
- bool need_flush = true, split = false;
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ trace_xlog_iclog_sync(iclog, _RET_IP_);
count = xlog_calc_iclog_size(log, iclog, &roundoff);
- /* move grant heads by roundoff in sync */
- xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
- xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ /*
+ * If we have a ticket, account for the roundoff via the ticket
+ * reservation to avoid touching the hot grant heads needlessly.
+ * Otherwise, we have to move grant heads directly.
+ */
+ if (ticket) {
+ ticket->t_curr_res -= roundoff;
+ } else {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ }
/* put cycle number in every block */
- xlog_pack_data(log, iclog, roundoff);
+ xlog_pack_data(log, iclog, roundoff);
/* real byte length */
size = iclog->ic_offset;
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
+ if (xfs_has_logv2(log->l_mp))
size += roundoff;
iclog->ic_header.h_len = cpu_to_be32(size);
@@ -1935,10 +2077,8 @@ xlog_sync(
bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
/* Do we need to split this write into 2 parts? */
- if (bno + BTOBB(count) > log->l_logBBsize) {
+ if (bno + BTOBB(count) > log->l_logBBsize)
xlog_split_iclog(log, &iclog->ic_header, bno, count);
- split = true;
- }
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
@@ -1959,22 +2099,8 @@ xlog_sync(
be64_to_cpu(iclog->ic_header.h_lsn));
}
#endif
-
- /*
- * Flush the data device before flushing the log to make sure all meta
- * data written back from the AIL actually made it to disk before
- * stamping the new log tail LSN into the log buffer. For an external
- * log we need to issue the flush explicitly, and unfortunately
- * synchronously here; for an internal log we can simply use the block
- * layer state machine for preflushes.
- */
- if (log->l_targ != log->l_mp->m_ddev_targp || split) {
- xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- need_flush = false;
- }
-
xlog_verify_iclog(log, iclog, count);
- xlog_write_iclog(log, iclog, bno, count, need_flush);
+ xlog_write_iclog(log, iclog, bno, count);
}
/*
@@ -1987,8 +2113,6 @@ xlog_dealloc_log(
xlog_in_core_t *iclog, *next_iclog;
int i;
- xlog_cil_destroy(log);
-
/*
* Cycle all the iclogbuf locks to make sure all log IO completion
* is done before we tear down these buffers.
@@ -2000,6 +2124,13 @@ xlog_dealloc_log(
iclog = iclog->ic_next;
}
+ /*
+ * Destroy the CIL after waiting for iclog IO completion because an
+ * iclog EIO error will try to shut down the log, which accesses the
+ * CIL to wake up the waiters.
+ */
+ xlog_cil_destroy(log);
+
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
@@ -2011,7 +2142,7 @@ xlog_dealloc_log(
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
-} /* xlog_dealloc_log */
+}
/*
* Update counters atomically now that memcpy is done.
@@ -2038,63 +2169,11 @@ xlog_print_tic_res(
struct xfs_mount *mp,
struct xlog_ticket *ticket)
{
- uint i;
- uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
- /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[] = {
- REG_TYPE_STR(BFORMAT, "bformat"),
- REG_TYPE_STR(BCHUNK, "bchunk"),
- REG_TYPE_STR(EFI_FORMAT, "efi_format"),
- REG_TYPE_STR(EFD_FORMAT, "efd_format"),
- REG_TYPE_STR(IFORMAT, "iformat"),
- REG_TYPE_STR(ICORE, "icore"),
- REG_TYPE_STR(IEXT, "iext"),
- REG_TYPE_STR(IBROOT, "ibroot"),
- REG_TYPE_STR(ILOCAL, "ilocal"),
- REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
- REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
- REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
- REG_TYPE_STR(QFORMAT, "qformat"),
- REG_TYPE_STR(DQUOT, "dquot"),
- REG_TYPE_STR(QUOTAOFF, "quotaoff"),
- REG_TYPE_STR(LRHEADER, "LR header"),
- REG_TYPE_STR(UNMOUNT, "unmount"),
- REG_TYPE_STR(COMMIT, "commit"),
- REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create"),
- REG_TYPE_STR(RUI_FORMAT, "rui_format"),
- REG_TYPE_STR(RUD_FORMAT, "rud_format"),
- REG_TYPE_STR(CUI_FORMAT, "cui_format"),
- REG_TYPE_STR(CUD_FORMAT, "cud_format"),
- REG_TYPE_STR(BUI_FORMAT, "bui_format"),
- REG_TYPE_STR(BUD_FORMAT, "bud_format"),
- };
- BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
xfs_warn(mp, "ticket reservation summary:");
- xfs_warn(mp, " unit res = %d bytes",
- ticket->t_unit_res);
- xfs_warn(mp, " current res = %d bytes",
- ticket->t_curr_res);
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
- ticket->t_res_arr_sum, ticket->t_res_o_flow);
- xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
- ticket->t_res_num_ophdrs, ophdr_spc);
- xfs_warn(mp, " ophdr + reg = %u bytes",
- ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
- xfs_warn(mp, " num regions = %u",
- ticket->t_res_num);
-
- for (i = 0; i < ticket->t_res_num; i++) {
- uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes", i,
- ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type]),
- ticket->t_res_arr[i].r_len);
- }
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
}
/*
@@ -2147,201 +2226,226 @@ xlog_print_trans(
}
}
-/*
- * Calculate the potential space needed by the log vector. Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
- */
-static int
-xlog_write_calc_vec_length(
- struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector)
+static inline void
+xlog_write_iovec(
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ void *data,
+ uint32_t write_len,
+ int *bytes_left,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- struct xfs_log_vec *lv;
- int headers = 0;
- int len = 0;
- int i;
-
- /* acct for start rec of xact */
- if (ticket->t_flags & XLOG_TIC_INITED)
- headers++;
-
- for (lv = log_vector; lv; lv = lv->lv_next) {
- /* we don't write ordered log vectors */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
- continue;
-
- headers += lv->lv_niovecs;
-
- for (i = 0; i < lv->lv_niovecs; i++) {
- struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+ ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+ ASSERT(*log_offset % sizeof(int32_t) == 0);
+ ASSERT(write_len % sizeof(int32_t) == 0);
- len += vecp->i_len;
- xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
- }
- }
-
- ticket->t_res_num_ophdrs += headers;
- len += headers * sizeof(struct xlog_op_header);
-
- return len;
+ memcpy(iclog->ic_datap + *log_offset, data, write_len);
+ *log_offset += write_len;
+ *bytes_left -= write_len;
+ (*record_cnt)++;
+ *data_cnt += write_len;
}
/*
- * If first write for transaction, insert start record We can't be trying to
- * commit if we are inited. We can't have any "partial_copy" if we are inited.
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
*/
-static int
-xlog_write_start_rec(
- struct xlog_op_header *ophdr,
- struct xlog_ticket *ticket)
+static void
+xlog_write_full(
+ struct xfs_log_vec *lv,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- if (!(ticket->t_flags & XLOG_TIC_INITED))
- return 0;
+ int index;
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_len = 0;
- ophdr->oh_flags = XLOG_START_TRANS;
- ophdr->oh_res2 = 0;
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
- ticket->t_flags &= ~XLOG_TIC_INITED;
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
- return sizeof(struct xlog_op_header);
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ reg->i_len, len, record_cnt, data_cnt);
+ }
}
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
- struct xlog *log,
- struct xlog_op_header *ophdr,
+static int
+xlog_write_get_more_iclog_space(
struct xlog_ticket *ticket,
- uint flags)
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_res2 = 0;
-
- /* are we copying a commit or unmount record? */
- ophdr->oh_flags = flags;
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog *log = iclog->ic_log;
+ int error;
- /*
- * We've seen logs corrupted with bad transaction client ids. This
- * makes sure that XFS doesn't generate them on. Turn this into an EIO
- * and shut down the filesystem.
- */
- switch (ophdr->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_warn(log->l_mp,
- "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
- ophdr->oh_clientid, ticket);
- return NULL;
- }
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ error = xlog_state_release_iclog(log, iclog, ticket);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
- return ophdr;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
}
/*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop. We return the log vector that is to be written that cannot
+ * wholly fit in the iclog.
*/
static int
-xlog_write_setup_copy(
+xlog_write_partial(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xlog_op_header *ophdr,
- int space_available,
- int space_required,
- int *copy_off,
- int *copy_len,
- int *last_was_partial_copy,
- int *bytes_consumed)
-{
- int still_to_copy;
-
- still_to_copy = space_required - *bytes_consumed;
- *copy_off = *bytes_consumed;
-
- if (still_to_copy <= space_available) {
- /* write of region completes here */
- *copy_len = still_to_copy;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- if (*last_was_partial_copy)
- ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- *last_was_partial_copy = 0;
- *bytes_consumed = 0;
- return 0;
- }
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog_op_header *ophdr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
- /* partial write of region, needs extra log op header reservation */
- *copy_len = space_available;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (*last_was_partial_copy)
- ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
- *bytes_consumed += *copy_len;
- (*last_was_partial_copy)++;
+ /* walk the logvec, copying until we run out of space in the iclog */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ uint32_t reg_offset = 0;
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
- ticket->t_res_num_ophdrs++;
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return error;
+ }
- return sizeof(struct xlog_op_header);
-}
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
-static int
-xlog_write_copy_finish(
- struct xlog *log,
- struct xlog_in_core *iclog,
- uint flags,
- int *record_cnt,
- int *data_cnt,
- int *partial_copy,
- int *partial_copy_len,
- int log_offset,
- struct xlog_in_core **commit_iclog)
-{
- int error;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ rlen, len, record_cnt, data_cnt);
+
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
- if (*partial_copy) {
/*
- * This iclog has already been marked WANT_SYNC by
- * xlog_state_get_iclog_space.
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. if we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
*/
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
- goto release_iclog;
- }
+ do {
+ /*
+ * Ensure we include the continuation opheader in the
+ * space we need in the new iclog by adding that size
+ * to the length we require. This continuation opheader
+ * needs to be accounted to the ticket as the space it
+ * consumes hasn't been accounted to the lv we are
+ * writing.
+ */
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset,
+ *len + sizeof(struct xlog_op_header),
+ record_cnt, data_cnt);
+ if (error)
+ return error;
- *partial_copy = 0;
- *partial_copy_len = 0;
+ ophdr = iclog->ic_datap + *log_offset;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
- if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ *log_offset += sizeof(struct xlog_op_header);
+ *data_cnt += sizeof(struct xlog_op_header);
- xlog_state_want_sync(log, iclog);
- if (!commit_iclog)
- goto release_iclog;
- spin_unlock(&log->l_icloglock);
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_write_iovec(iclog, log_offset,
+ reg->i_addr + reg_offset,
+ rlen, len, record_cnt, data_cnt);
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
}
+ /*
+ * No more iovecs remain in this logvec so return the next log vec to
+ * the caller so it can go back to fast path copying.
+ */
+ *iclogp = iclog;
return 0;
-
-release_iclog:
- error = xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- return error;
}
/*
@@ -2387,299 +2491,187 @@ release_iclog:
int
xlog_write(
struct xlog *log,
- struct xfs_log_vec *log_vector,
+ struct xfs_cil_ctx *ctx,
+ struct list_head *lv_chain,
struct xlog_ticket *ticket,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
- uint flags)
+ uint32_t len)
+
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_iovec *vecp;
struct xfs_log_vec *lv;
- int len;
- int index;
- int partial_copy = 0;
- int partial_copy_len = 0;
- int contwr = 0;
- int record_cnt = 0;
- int data_cnt = 0;
+ uint32_t record_cnt = 0;
+ uint32_t data_cnt = 0;
int error = 0;
-
- *start_lsn = 0;
-
- len = xlog_write_calc_vec_length(ticket, log_vector);
-
- /*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
- */
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
+ int log_offset;
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
xlog_print_tic_res(log->l_mp, ticket);
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
- index = 0;
- lv = log_vector;
- vecp = lv->lv_iovecp;
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- void *ptr;
- int log_offset;
-
- error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset);
- if (error)
- return error;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = iclog->ic_datap + log_offset;
+ ASSERT(log_offset <= iclog->ic_size - 1);
- /* start_lsn is the first lsn written to. That's all we need. */
- if (!*start_lsn)
- *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx)
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+ list_for_each_entry(lv, lv_chain, lv_list) {
/*
- * This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
*/
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- struct xfs_log_iovec *reg;
- struct xlog_op_header *ophdr;
- int start_rec_copy;
- int copy_len;
- int copy_off;
- bool ordered = false;
-
- /* ordered log vectors have no regions to write */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
- ASSERT(lv->lv_niovecs == 0);
- ordered = true;
- goto next_lv;
- }
-
- reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
- start_rec_copy = xlog_write_start_rec(ptr, ticket);
- if (start_rec_copy) {
- record_cnt++;
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- start_rec_copy);
- }
-
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
- if (!ophdr)
- return -EIO;
-
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
-
- len += xlog_write_setup_copy(ticket, ophdr,
- iclog->ic_size-log_offset,
- reg->i_len,
- &copy_off, &copy_len,
- &partial_copy,
- &partial_copy_len);
- xlog_verify_dest_ptr(log, ptr);
-
- /*
- * Copy region.
- *
- * Unmount records just log an opheader, so can have
- * empty payloads with no data region to copy. Hence we
- * only copy the payload if the vector says it has data
- * to copy.
- */
- ASSERT(copy_len >= 0);
- if (copy_len > 0) {
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- copy_len);
- }
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
- record_cnt++;
- data_cnt += contwr ? copy_len : 0;
-
- error = xlog_write_copy_finish(log, iclog, flags,
- &record_cnt, &data_cnt,
- &partial_copy,
- &partial_copy_len,
- log_offset,
- commit_iclog);
- if (error)
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - log_offset) {
+ error = xlog_write_partial(lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (error) {
+ /*
+ * We have no iclog to release, so just return
+ * the error immediately.
+ */
return error;
-
- /*
- * if we had a partial copy, we need to get more iclog
- * space but we don't want to increment the region
- * index because there is still more is this region to
- * write.
- *
- * If we completed writing this region, and we flushed
- * the iclog (indicated by resetting of the record
- * count), then we also need to get more log space. If
- * this was the last record, though, we are done and
- * can just return.
- */
- if (partial_copy)
- break;
-
- if (++index == lv->lv_niovecs) {
-next_lv:
- lv = lv->lv_next;
- index = 0;
- if (lv)
- vecp = lv->lv_iovecp;
- }
- if (record_cnt == 0 && !ordered) {
- if (!lv)
- return 0;
- break;
}
+ } else {
+ xlog_write_full(lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
}
}
-
ASSERT(len == 0);
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (commit_iclog) {
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- } else {
- error = xlog_state_release_iclog(log, iclog);
- }
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
return error;
}
+static void
+xlog_state_activate_iclog(
+ struct xlog_in_core *iclog,
+ int *iclogs_changed)
+{
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
+ trace_xlog_iclog_activate(iclog, _RET_IP_);
-/*****************************************************************************
- *
- * State Machine functions
- *
- *****************************************************************************
- */
+ /*
+ * If the number of ops in this iclog indicate it just contains the
+ * dummy transaction, we can change state into IDLE (the second time
+ * around). Otherwise we should change the state into NEED a dummy.
+ * We don't need to cover the dummy.
+ */
+ if (*iclogs_changed == 0 &&
+ iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
+ *iclogs_changed = 1;
+ } else {
+ /*
+ * We have two dirty iclogs so start over. This could also be
+ * num of ops indicating this is not the dummy going out.
+ */
+ *iclogs_changed = 2;
+ }
+
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_offset = 0;
+ iclog->ic_header.h_num_logops = 0;
+ memset(iclog->ic_header.h_cycle_data, 0,
+ sizeof(iclog->ic_header.h_cycle_data));
+ iclog->ic_header.h_lsn = 0;
+ iclog->ic_header.h_tail_lsn = 0;
+}
/*
- * An iclog has just finished IO completion processing, so we need to update
- * the iclog state and propagate that up into the overall log state. Hence we
- * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
- * starting from the head, and then wake up any threads that are waiting for the
- * iclog to be marked clean.
- *
- * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
- * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
- * maintain the notion that we use a ordered wait queue to hold off would be
- * writers to the log when every iclog is trying to sync to disk.
- *
- * Caller must hold the icloglock before calling us.
- *
- * State Change: !IOERROR -> DIRTY -> ACTIVE
+ * Loop through all iclogs and mark all iclogs currently marked DIRTY as
+ * ACTIVE after iclog I/O has completed.
*/
-STATIC void
-xlog_state_clean_iclog(
+static void
+xlog_state_activate_iclogs(
struct xlog *log,
- struct xlog_in_core *dirty_iclog)
+ int *iclogs_changed)
{
- struct xlog_in_core *iclog;
- int changed = 0;
-
- /* Prepare the completed iclog. */
- if (dirty_iclog->ic_state != XLOG_STATE_IOERROR)
- dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+ struct xlog_in_core *iclog = log->l_iclog;
- /* Walk all the iclogs to update the ordered active state. */
- iclog = log->l_iclog;
do {
- if (iclog->ic_state == XLOG_STATE_DIRTY) {
- iclog->ic_state = XLOG_STATE_ACTIVE;
- iclog->ic_offset = 0;
- ASSERT(list_empty_careful(&iclog->ic_callbacks));
- /*
- * If the number of ops in this iclog indicate it just
- * contains the dummy transaction, we can
- * change state into IDLE (the second time around).
- * Otherwise we should change the state into
- * NEED a dummy.
- * We don't need to cover the dummy.
- */
- if (!changed &&
- (be32_to_cpu(iclog->ic_header.h_num_logops) ==
- XLOG_COVER_OPS)) {
- changed = 1;
- } else {
- /*
- * We have two dirty iclogs so start over
- * This could also be num of ops indicates
- * this is not the dummy going out.
- */
- changed = 2;
- }
- iclog->ic_header.h_num_logops = 0;
- memset(iclog->ic_header.h_cycle_data, 0,
- sizeof(iclog->ic_header.h_cycle_data));
- iclog->ic_header.h_lsn = 0;
- } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
- /* do nothing */;
- else
- break; /* stop cleaning */
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
-
+ if (iclog->ic_state == XLOG_STATE_DIRTY)
+ xlog_state_activate_iclog(iclog, iclogs_changed);
+ /*
+ * The ordering of marking iclogs ACTIVE must be maintained, so
+ * an iclog doesn't become ACTIVE beyond one that is SYNCING.
+ */
+ else if (iclog->ic_state != XLOG_STATE_ACTIVE)
+ break;
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+static int
+xlog_covered_state(
+ int prev_state,
+ int iclogs_changed)
+{
/*
- * Wake up threads waiting in xfs_log_force() for the dirty iclog
- * to be cleaned.
+ * We go to NEED for any non-covering writes. We go to NEED2 if we just
+ * wrote the first covering record (DONE). We go to IDLE if we just
+ * wrote the second covering record (DONE2) and remain in IDLE until a
+ * non-covering write occurs.
*/
- wake_up_all(&dirty_iclog->ic_force_wait);
+ switch (prev_state) {
+ case XLOG_STATE_COVER_IDLE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ fallthrough;
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ break;
+ case XLOG_STATE_COVER_DONE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_NEED2;
+ break;
+ case XLOG_STATE_COVER_DONE2:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ break;
+ default:
+ ASSERT(0);
+ }
- /*
- * Change state for the dummy log recording.
- * We usually go to NEED. But we go to NEED2 if the changed indicates
- * we are done writing the dummy record.
- * If we are done with the second dummy recored (DONE2), then
- * we go to IDLE.
- */
- if (changed) {
- switch (log->l_covered_state) {
- case XLOG_STATE_COVER_IDLE:
- case XLOG_STATE_COVER_NEED:
- case XLOG_STATE_COVER_NEED2:
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
+ return XLOG_STATE_COVER_NEED;
+}
- case XLOG_STATE_COVER_DONE:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_NEED2;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
+STATIC void
+xlog_state_clean_iclog(
+ struct xlog *log,
+ struct xlog_in_core *dirty_iclog)
+{
+ int iclogs_changed = 0;
- case XLOG_STATE_COVER_DONE2:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_IDLE;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
+ trace_xlog_iclog_clean(dirty_iclog, _RET_IP_);
- default:
- ASSERT(0);
- }
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+ xlog_state_activate_iclogs(log, &iclogs_changed);
+ wake_up_all(&dirty_iclog->ic_force_wait);
+
+ if (iclogs_changed) {
+ log->l_covered_state = xlog_covered_state(log->l_covered_state,
+ iclogs_changed);
}
}
@@ -2731,6 +2723,7 @@ xlog_state_set_callback(
struct xlog_in_core *iclog,
xfs_lsn_t header_lsn)
{
+ trace_xlog_iclog_callback(iclog, _RET_IP_);
iclog->ic_state = XLOG_STATE_CALLBACK;
ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
@@ -2751,8 +2744,7 @@ xlog_state_set_callback(
static bool
xlog_state_iodone_process_iclog(
struct xlog *log,
- struct xlog_in_core *iclog,
- bool *ioerror)
+ struct xlog_in_core *iclog)
{
xfs_lsn_t lowest_lsn;
xfs_lsn_t header_lsn;
@@ -2764,15 +2756,6 @@ xlog_state_iodone_process_iclog(
* Skip all iclogs in the ACTIVE & DIRTY states:
*/
return false;
- case XLOG_STATE_IOERROR:
- /*
- * Between marking a filesystem SHUTDOWN and stopping the log,
- * we do flush all iclogs to disk (if there wasn't a log I/O
- * error). So, we do want things to go smoothly in case of just
- * a SHUTDOWN w/o a LOG_IO_ERROR.
- */
- *ioerror = true;
- return false;
case XLOG_STATE_DONE_SYNC:
/*
* Now that we have an iclog that is in the DONE_SYNC state, do
@@ -2797,104 +2780,74 @@ xlog_state_iodone_process_iclog(
}
/*
- * Keep processing entries in the iclog callback list until we come around and
- * it is empty. We need to atomically see that the list is empty and change the
- * state to DIRTY so that we don't miss any more callbacks being added.
- *
- * This function is called with the icloglock held and returns with it held. We
- * drop it while running callbacks, however, as holding it over thousands of
- * callbacks is unnecessary and causes excessive contention if we do.
+ * Loop over all the iclogs, running attached callbacks on them. Return true if
+ * we ran any callbacks, indicating that we dropped the icloglock. We don't need
+ * to handle transient shutdown state here at all because
+ * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown
+ * cleanup of the callbacks.
*/
-static void
+static bool
xlog_state_do_iclog_callbacks(
- struct xlog *log,
- struct xlog_in_core *iclog,
- bool aborted)
+ struct xlog *log)
__releases(&log->l_icloglock)
__acquires(&log->l_icloglock)
{
- spin_unlock(&log->l_icloglock);
- spin_lock(&iclog->ic_callback_lock);
- while (!list_empty(&iclog->ic_callbacks)) {
- LIST_HEAD(tmp);
+ struct xlog_in_core *first_iclog = log->l_iclog;
+ struct xlog_in_core *iclog = first_iclog;
+ bool ran_callback = false;
- list_splice_init(&iclog->ic_callbacks, &tmp);
+ do {
+ LIST_HEAD(cb_list);
- spin_unlock(&iclog->ic_callback_lock);
- xlog_cil_process_committed(&tmp, aborted);
- spin_lock(&iclog->ic_callback_lock);
- }
+ if (xlog_state_iodone_process_iclog(log, iclog))
+ break;
+ if (iclog->ic_state != XLOG_STATE_CALLBACK) {
+ iclog = iclog->ic_next;
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+ spin_unlock(&log->l_icloglock);
- /*
- * Pick up the icloglock while still holding the callback lock so we
- * serialise against anyone trying to add more callbacks to this iclog
- * now we've finished processing.
- */
- spin_lock(&log->l_icloglock);
- spin_unlock(&iclog->ic_callback_lock);
+ trace_xlog_iclog_callbacks_start(iclog, _RET_IP_);
+ xlog_cil_process_committed(&cb_list);
+ trace_xlog_iclog_callbacks_done(iclog, _RET_IP_);
+ ran_callback = true;
+
+ spin_lock(&log->l_icloglock);
+ xlog_state_clean_iclog(log, iclog);
+ iclog = iclog->ic_next;
+ } while (iclog != first_iclog);
+
+ return ran_callback;
}
+
+/*
+ * Loop running iclog completion callbacks until there are no more iclogs in a
+ * state that can run callbacks.
+ */
STATIC void
xlog_state_do_callback(
- struct xlog *log,
- bool aborted)
+ struct xlog *log)
{
- struct xlog_in_core *iclog;
- struct xlog_in_core *first_iclog;
- bool cycled_icloglock;
- bool ioerror;
int flushcnt = 0;
int repeats = 0;
spin_lock(&log->l_icloglock);
- do {
- /*
- * Scan all iclogs starting with the one pointed to by the
- * log. Reset this starting point each time the log is
- * unlocked (during callbacks).
- *
- * Keep looping through iclogs until one full pass is made
- * without running any callbacks.
- */
- first_iclog = log->l_iclog;
- iclog = log->l_iclog;
- cycled_icloglock = false;
- ioerror = false;
- repeats++;
-
- do {
- if (xlog_state_iodone_process_iclog(log, iclog,
- &ioerror))
- break;
-
- if (iclog->ic_state != XLOG_STATE_CALLBACK &&
- iclog->ic_state != XLOG_STATE_IOERROR) {
- iclog = iclog->ic_next;
- continue;
- }
-
- /*
- * Running callbacks will drop the icloglock which means
- * we'll have to run at least one more complete loop.
- */
- cycled_icloglock = true;
- xlog_state_do_iclog_callbacks(log, iclog, aborted);
-
- xlog_state_clean_iclog(log, iclog);
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
+ while (xlog_state_do_iclog_callbacks(log)) {
+ if (xlog_is_shutdown(log))
+ break;
- if (repeats > 5000) {
+ if (++repeats > 5000) {
flushcnt += repeats;
repeats = 0;
xfs_warn(log->l_mp,
"%s: possible infinite loop (%d iterations)",
__func__, flushcnt);
}
- } while (!ioerror && cycled_icloglock);
+ }
- if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE ||
- log->l_iclog->ic_state == XLOG_STATE_IOERROR)
+ if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE)
wake_up_all(&log->l_flush_wait);
spin_unlock(&log->l_icloglock);
@@ -2904,37 +2857,28 @@ xlog_state_do_callback(
/*
* Finish transitioning this iclog to the dirty state.
*
- * Make sure that we completely execute this routine only when this is
- * the last call to the iclog. There is a good chance that iclog flushes,
- * when we reach the end of the physical log, get turned into 2 separate
- * calls to bwrite. Hence, one iclog flush could generate two calls to this
- * routine. By using the reference count bwritecnt, we guarantee that only
- * the second completion goes through.
- *
* Callbacks could take time, so they are done outside the scope of the
* global state machine log lock.
*/
STATIC void
xlog_state_done_syncing(
- struct xlog_in_core *iclog,
- bool aborted)
+ struct xlog_in_core *iclog)
{
struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
-
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ trace_xlog_iclog_sync_done(iclog, _RET_IP_);
/*
* If we got an error, either on the first buffer, or in the case of
- * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
- * and none should ever be attempted to be written to disk
- * again.
+ * split log writes, on the second, we shut down the file system and
+ * no iclogs should ever be attempted to be written to disk again.
*/
- if (iclog->ic_state == XLOG_STATE_SYNCING)
+ if (!xlog_is_shutdown(log)) {
+ ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
iclog->ic_state = XLOG_STATE_DONE_SYNC;
- else
- ASSERT(iclog->ic_state == XLOG_STATE_IOERROR);
+ }
/*
* Someone could be sleeping prior to writing out the next
@@ -2943,9 +2887,8 @@ xlog_state_done_syncing(
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log, aborted); /* also cleans log */
-} /* xlog_state_done_syncing */
-
+ xlog_state_do_callback(log);
+}
/*
* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
@@ -2971,7 +2914,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclogp,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp)
{
int log_offset;
@@ -2980,7 +2922,7 @@ xlog_state_get_iclog_space(
restart:
spin_lock(&log->l_icloglock);
- if (XLOG_FORCED_SHUTDOWN(log)) {
+ if (xlog_is_shutdown(log)) {
spin_unlock(&log->l_icloglock);
return -EIO;
}
@@ -2999,6 +2941,8 @@ restart:
atomic_inc(&iclog->ic_refcnt); /* prevents sync */
log_offset = iclog->ic_offset;
+ trace_xlog_iclog_get_space(iclog, _RET_IP_);
+
/* On the 1st write to an iclog, figure out lsn. This works
* if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
* committing to. If the offset is set, that's how many blocks
@@ -3006,9 +2950,6 @@ restart:
*/
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
- xlog_tic_add_region(ticket,
- log->l_iclog_hsize,
- XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3037,7 +2978,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3050,13 +2991,10 @@ restart:
* iclogs (to mark it taken), this particular iclog will release/sync
* to disk in xlog_write().
*/
- if (len <= iclog->ic_size - iclog->ic_offset) {
- *continued_write = 0;
+ if (len <= iclog->ic_size - iclog->ic_offset)
iclog->ic_offset += len;
- } else {
- *continued_write = 1;
+ else
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- }
*iclogp = iclog;
ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3064,21 +3002,21 @@ restart:
*logoffsetp = log_offset;
return 0;
-} /* xlog_state_get_iclog_space */
-
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one units worth. Also move grant reservation head
+ * forward.
*/
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- trace_xfs_log_regrant_reserve_enter(log, ticket);
+ trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
@@ -3088,23 +3026,20 @@ xlog_regrant_reserve_log_space(
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
- trace_xfs_log_regrant_reserve_sub(log, ticket);
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0)
- return;
-
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
- trace_xfs_log_regrant_reserve_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
-} /* xlog_regrant_reserve_log_space */
+ ticket->t_curr_res = ticket->t_unit_res;
+ }
+ xfs_log_ticket_put(ticket);
+}
/*
* Give back the space left from a reservation.
@@ -3120,18 +3055,19 @@ xlog_regrant_reserve_log_space(
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- int bytes;
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- trace_xfs_log_ungrant_enter(log, ticket);
- trace_xfs_log_ungrant_sub(log, ticket);
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
@@ -3146,25 +3082,26 @@ xlog_ungrant_log_space(
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
- trace_xfs_log_ungrant_exit(log, ticket);
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
}
/*
- * This routine will mark the current iclog in the ring as WANT_SYNC
- * and move the current iclog pointer to the next iclog in the ring.
- * When this routine is called from xlog_state_get_iclog_space(), the
- * exact size of the iclog has not yet been determined. All we know is
- * that every data block. We have run out of space in this log record.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
*/
-STATIC void
+void
xlog_state_switch_iclogs(
struct xlog *log,
struct xlog_in_core *iclog,
int eventual_size)
{
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ assert_spin_locked(&log->l_icloglock);
+ trace_xlog_iclog_switch(iclog, _RET_IP_);
+
if (!eventual_size)
eventual_size = iclog->ic_offset;
iclog->ic_state = XLOG_STATE_WANT_SYNC;
@@ -3176,9 +3113,8 @@ xlog_state_switch_iclogs(
log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
/* Round up to next log-sunit */
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
- log->l_mp->m_sb.sb_logsunit > 1) {
- uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
+ if (log->l_iclog_roundoff > BBSIZE) {
+ uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff);
log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
}
@@ -3199,7 +3135,36 @@ xlog_state_switch_iclogs(
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
-} /* xlog_state_switch_iclogs */
+}
+
+/*
+ * Force the iclog to disk and check if the iclog has been completed before
+ * xlog_force_iclog() returns. This can happen on synchronous (e.g.
+ * pmem) or fast async storage because we drop the icloglock to issue the IO.
+ * If completion has already occurred, tell the caller so that it can avoid an
+ * unnecessary wait on the iclog.
+ */
+static int
+xlog_force_and_check_iclog(
+ struct xlog_in_core *iclog,
+ bool *completed)
+{
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ int error;
+
+ *completed = false;
+ error = xlog_force_iclog(iclog);
+ if (error)
+ return error;
+
+ /*
+ * If the iclog has already been completed and reused the header LSN
+ * will have been rewritten by completion
+ */
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ *completed = true;
+ return 0;
+}
/*
* Write out all data in the in-core log as of this exact moment in time.
@@ -3235,7 +3200,6 @@ xfs_log_force(
{
struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
- xfs_lsn_t lsn;
XFS_STATS_INC(mp, xs_log_force);
trace_xfs_log_force(mp, 0, _RET_IP_);
@@ -3243,10 +3207,12 @@ xfs_log_force(
xlog_cil_force(log);
spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- if (iclog->ic_state == XLOG_STATE_IOERROR)
+ if (xlog_is_shutdown(log))
goto out_error;
+ iclog = log->l_iclog;
+ trace_xlog_iclog_force(iclog, _RET_IP_);
+
if (iclog->ic_state == XLOG_STATE_DIRTY ||
(iclog->ic_state == XLOG_STATE_ACTIVE &&
atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) {
@@ -3259,56 +3225,37 @@ xfs_log_force(
* previous iclog and go to sleep.
*/
iclog = iclog->ic_prev;
- if (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
if (atomic_read(&iclog->ic_refcnt) == 0) {
- /*
- * We are the only one with access to this iclog.
- *
- * Flush it out now. There should be a roundoff of zero
- * to show that someone has already taken care of the
- * roundoff from the previous sync.
- */
- atomic_inc(&iclog->ic_refcnt);
- lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ /* We have exclusive access to this iclog. */
+ bool completed;
+
+ if (xlog_force_and_check_iclog(iclog, &completed))
goto out_error;
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn ||
- iclog->ic_state == XLOG_STATE_DIRTY)
+ if (completed)
goto out_unlock;
} else {
/*
- * Someone else is writing to this iclog.
- *
- * Use its call to flush out the data. However, the
- * other thread may not force out this LR, so we mark
- * it WANT_SYNC.
+ * Someone else is still writing to this iclog, so we
+ * need to ensure that when they release the iclog it
+ * gets synced immediately as we may be waiting on it.
*/
xlog_state_switch_iclogs(log, iclog, 0);
}
- } else {
- /*
- * If the head iclog is not active nor dirty, we just attach
- * ourselves to the head and go to sleep if necessary.
- */
- ;
}
- if (!(flags & XFS_LOG_SYNC))
- goto out_unlock;
-
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- goto out_error;
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
+ /*
+ * The iclog we are about to wait on may contain the checkpoint pushed
+ * by the above xlog_cil_force() call, but it may not have been pushed
+ * to disk yet. Like the ACTIVE case above, we need to make sure caches
+ * are flushed when this iclog is written.
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3317,32 +3264,45 @@ out_error:
return -EIO;
}
+/*
+ * Force the log to a specific LSN.
+ *
+ * If an iclog with that lsn can be found:
+ * If it is in the DIRTY state, just return.
+ * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ * state and go to sleep or return.
+ * If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a wait queue. All callers trying
+ * to force a given lsn to disk must wait on the queue attached to the
+ * specific in-core log. When given in-core log finally completes its write
+ * to disk, that thread will wake up all threads waiting on the queue.
+ */
static int
-__xfs_log_force_lsn(
- struct xfs_mount *mp,
+xlog_force_lsn(
+ struct xlog *log,
xfs_lsn_t lsn,
uint flags,
int *log_flushed,
bool already_slept)
{
- struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
+ bool completed;
spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- if (iclog->ic_state == XLOG_STATE_IOERROR)
+ if (xlog_is_shutdown(log))
goto out_error;
+ iclog = log->l_iclog;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+ trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
iclog = iclog->ic_next;
if (iclog == log->l_iclog)
goto out_unlock;
}
- if (iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
-
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
/*
* We sleep here if we haven't already slept (e.g. this is the
* first time we've looked at the correct iclog buf) and the
@@ -3361,34 +3321,39 @@ __xfs_log_force_lsn(
if (!already_slept &&
(iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
- XFS_STATS_INC(mp, xs_log_force_sleep);
-
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
return -EAGAIN;
}
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_force_and_check_iclog(iclog, &completed))
goto out_error;
if (log_flushed)
*log_flushed = 1;
+ if (completed)
+ goto out_unlock;
+ break;
+ case XLOG_STATE_WANT_SYNC:
+ /*
+ * This iclog may contain the checkpoint pushed by the
+ * xlog_cil_force_seq() call, but there are other writers still
+ * accessing it so it hasn't been pushed to disk yet. Like the
+ * ACTIVE case above, we need to make sure caches are flushed
+ * when this iclog is written.
+ */
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ break;
+ default:
+ /*
+ * The entire checkpoint was written by the CIL force and is on
+ * its way to disk already. It will be stable when it
+ * completes, so we don't need to manipulate caches here at all.
+ * We just need to wait for completion if necessary.
+ */
+ break;
}
- if (!(flags & XFS_LOG_SYNC) ||
- (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY))
- goto out_unlock;
-
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- goto out_error;
-
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state == XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3398,70 +3363,42 @@ out_error:
}
/*
- * Force the in-core log to disk for a specific LSN.
- *
- * Find in-core log with lsn.
- * If it is in the DIRTY state, just return.
- * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
- * state and go to sleep or return.
- * If it is in any other state, go to sleep or return.
+ * Force the log to a specific checkpoint sequence.
*
- * Synchronous forces are implemented with a wait queue. All callers trying
- * to force a given lsn to disk must wait on the queue attached to the
- * specific in-core log. When given in-core log finally completes its write
- * to disk, that thread will wake up all threads waiting on the queue.
+ * First force the CIL so that all the required changes have been flushed to the
+ * iclogs. If the CIL force completed it will return a commit LSN that indicates
+ * the iclog that needs to be flushed to stable storage. If the caller needs
+ * a synchronous log force, we will wait on the iclog with the LSN returned by
+ * xlog_cil_force_seq() to be completed.
*/
int
-xfs_log_force_lsn(
+xfs_log_force_seq(
struct xfs_mount *mp,
- xfs_lsn_t lsn,
+ xfs_csn_t seq,
uint flags,
int *log_flushed)
{
+ struct xlog *log = mp->m_log;
+ xfs_lsn_t lsn;
int ret;
- ASSERT(lsn != 0);
+ ASSERT(seq != 0);
XFS_STATS_INC(mp, xs_log_force);
- trace_xfs_log_force(mp, lsn, _RET_IP_);
+ trace_xfs_log_force(mp, seq, _RET_IP_);
- lsn = xlog_cil_force_lsn(mp->m_log, lsn);
+ lsn = xlog_cil_force_seq(log, seq);
if (lsn == NULLCOMMITLSN)
return 0;
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
- if (ret == -EAGAIN)
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN) {
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
+ }
return ret;
}
/*
- * Called when we want to mark the current iclog as being ready to sync to
- * disk.
- */
-STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- assert_spin_locked(&log->l_icloglock);
-
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
- xlog_state_switch_iclogs(log, iclog, 0);
- } else {
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR);
- }
-}
-
-
-/*****************************************************************************
- *
- * TICKET functions
- *
- *****************************************************************************
- */
-
-/*
* Free a used ticket when its refcount falls to zero.
*/
void
@@ -3470,7 +3407,7 @@ xfs_log_ticket_put(
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_cache_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_cache, ticket);
}
xlog_ticket_t *
@@ -3486,12 +3423,12 @@ xfs_log_ticket_get(
* Figure out the total log space unit (in bytes) that would be
* required for a log ticket.
*/
-int
-xfs_log_calc_unit_res(
- struct xfs_mount *mp,
- int unit_bytes)
+static int
+xlog_calc_unit_res(
+ struct xlog *log,
+ int unit_bytes,
+ int *niclogs)
{
- struct xlog *log = mp->m_log;
int iclog_space;
uint num_headers;
@@ -3567,18 +3504,22 @@ xfs_log_calc_unit_res(
/* for commit-rec LR header - note: padding will subsume the ophdr */
unit_bytes += log->l_iclog_hsize;
- /* for roundoff padding for transaction data and one for commit record */
- if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
- /* log su roundoff */
- unit_bytes += 2 * mp->m_sb.sb_logsunit;
- } else {
- /* BB roundoff */
- unit_bytes += 2 * BBSIZE;
- }
+ /* roundoff padding for transaction data and one for commit record */
+ unit_bytes += 2 * log->l_iclog_roundoff;
+ if (niclogs)
+ *niclogs = num_headers;
return unit_bytes;
}
+int
+xfs_log_calc_unit_res(
+ struct xfs_mount *mp,
+ int unit_bytes)
+{
+ return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
+}
+
/*
* Allocate and initialise a new log ticket.
*/
@@ -3587,18 +3528,14 @@ xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
int cnt,
- char client,
- bool permanent,
- xfs_km_flags_t alloc_flags)
+ bool permanent)
{
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
- if (!tic)
- return NULL;
+ tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
- unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+ unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
@@ -3607,49 +3544,15 @@ xlog_ticket_alloc(
tic->t_curr_res = unit_res;
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
- tic->t_tid = prandom_u32();
- tic->t_clientid = client;
- tic->t_flags = XLOG_TIC_INITED;
+ tic->t_tid = get_random_u32();
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- xlog_tic_reset_res(tic);
-
return tic;
}
-
-/******************************************************************************
- *
- * Log debug routines
- *
- ******************************************************************************
- */
#if defined(DEBUG)
/*
- * Make sure that the destination ptr is within the valid data region of
- * one of the iclogs. This uses backup pointers stored in a different
- * part of the log in case we trash the log structure.
- */
-STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr)
-{
- int i;
- int good_ptr = 0;
-
- for (i = 0; i < log->l_iclog_bufs; i++) {
- if (ptr >= log->l_iclog_bak[i] &&
- ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
- good_ptr++;
- }
-
- if (!good_ptr)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-}
-
-/*
* Check to make sure the grant write head didn't just over lap the tail. If
* the cycles are the same, we can't be overlapping. Otherwise, make sure that
* the cycles differ by exactly one and check the byte count.
@@ -3671,17 +3574,15 @@ xlog_verify_grant_tail(
xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
if (tail_cycle != cycle) {
if (cycle - 1 != tail_cycle &&
- !(log->l_flags & XLOG_TAIL_WARN)) {
+ !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"%s: cycle - 1 != tail_cycle", __func__);
- log->l_flags |= XLOG_TAIL_WARN;
}
if (space > BBTOB(tail_blocks) &&
- !(log->l_flags & XLOG_TAIL_WARN)) {
+ !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"%s: space > BBTOB(tail_blocks)", __func__);
- log->l_flags |= XLOG_TAIL_WARN;
}
}
}
@@ -3690,10 +3591,10 @@ xlog_verify_grant_tail(
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t tail_lsn)
+ struct xlog_in_core *iclog)
{
- int blocks;
+ xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+ int blocks;
if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
blocks =
@@ -3710,7 +3611,7 @@ xlog_verify_tail_lsn(
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
-} /* xlog_verify_tail_lsn */
+}
/*
* Perform a number of checks on the iclog before writing to disk.
@@ -3778,7 +3679,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3789,11 +3690,12 @@ xlog_verify_iclog(
iclog->ic_header.h_cycle_data[idx]);
}
}
- if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
- "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
- __func__, clientid, ophead,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
(unsigned long)field_offset);
+ }
/* check length */
p = &ophead->oh_len;
@@ -3801,8 +3703,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((uintptr_t)&ophead->oh_len -
- (uintptr_t)iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3813,110 +3714,87 @@ xlog_verify_iclog(
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
-} /* xlog_verify_iclog */
-#endif
-
-/*
- * Mark all iclogs IOERROR. l_icloglock is held by the caller.
- */
-STATIC int
-xlog_state_ioerror(
- struct xlog *log)
-{
- xlog_in_core_t *iclog, *ic;
-
- iclog = log->l_iclog;
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- /*
- * Mark all the incore logs IOERROR.
- * From now on, no log flushes will result.
- */
- ic = iclog;
- do {
- ic->ic_state = XLOG_STATE_IOERROR;
- ic = ic->ic_next;
- } while (ic != iclog);
- return 0;
- }
- /*
- * Return non-zero, if state transition has already happened.
- */
- return 1;
}
+#endif
/*
- * This is called from xfs_force_shutdown, when we're forcibly
- * shutting down the filesystem, typically because of an IO error.
+ * Perform a forced shutdown on the log.
+ *
+ * This can be called from low level log code to trigger a shutdown, or from the
+ * high level mount shutdown code when the mount shuts down.
+ *
* Our main objectives here are to make sure that:
- * a. if !logerror, flush the logs to disk. Anything modified
- * after this is ignored.
- * b. the filesystem gets marked 'SHUTDOWN' for all interested
- * parties to find out, 'atomically'.
- * c. those who're sleeping on log reservations, pinned objects and
- * other resources get woken up, and be told the bad news.
- * d. nothing new gets queued up after (b) and (c) are done.
+ * a. if the shutdown was not due to a log IO error, flush the logs to
+ * disk. Anything modified after this is ignored.
+ * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested
+ * parties to find out. Nothing new gets queued after this is done.
+ * c. Tasks sleeping on log reservations, pinned objects and
+ * other resources get woken up.
+ * d. The mount is also marked as shut down so that log triggered shutdowns
+ * still behave the same as if they called xfs_forced_shutdown().
*
- * Note: for the !logerror case we need to flush the regions held in memory out
- * to disk first. This needs to be done before the log is marked as shutdown,
- * otherwise the iclog writes will fail.
+ * Return true if the shutdown cause was a log IO error and we actually shut the
+ * log down.
*/
-int
-xfs_log_force_umount(
- struct xfs_mount *mp,
- int logerror)
+bool
+xlog_force_shutdown(
+ struct xlog *log,
+ uint32_t shutdown_flags)
{
- struct xlog *log;
- int retval;
-
- log = mp->m_log;
-
- /*
- * If this happens during log recovery, don't worry about
- * locking; the log isn't open for business yet.
- */
- if (!log ||
- log->l_flags & XLOG_ACTIVE_RECOVERY) {
- mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
- if (mp->m_sb_bp)
- mp->m_sb_bp->b_flags |= XBF_DONE;
- return 0;
- }
+ bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
- /*
- * Somebody could've already done the hard work for us.
- * No need to get locks for this.
- */
- if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
- ASSERT(XLOG_FORCED_SHUTDOWN(log));
- return 1;
- }
+ if (!log)
+ return false;
/*
* Flush all the completed transactions to disk before marking the log
- * being shut down. We need to do it in this order to ensure that
- * completed operations are safely on disk before we shut down, and that
- * we don't have to issue any buffer IO after the shutdown flags are set
- * to guarantee this.
+ * being shut down. We need to do this first as shutting down the log
+ * before the force will prevent the log force from flushing the iclogs
+ * to disk.
+ *
+ * When we are in recovery, there are no transactions to flush, and
+ * we don't want to touch the log because we don't want to perturb the
+ * current head/tail for future recovery attempts. Hence we need to
+ * avoid a log force in this case.
+ *
+ * If we are shutting down due to a log IO error, then we must avoid
+ * trying to write the log as that may just result in more IO errors and
+ * an endless shutdown/force loop.
*/
- if (!logerror)
- xfs_log_force(mp, XFS_LOG_SYNC);
+ if (!log_error && !xlog_in_recovery(log))
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
/*
- * mark the filesystem and the as in a shutdown state and wake
- * everybody up to tell them the bad news.
+ * Atomically set the shutdown state. If the shutdown state is already
+ * set, there someone else is performing the shutdown and so we are done
+ * here. This should never happen because we should only ever get called
+ * once by the first shutdown caller.
+ *
+ * Much of the log state machine transitions assume that shutdown state
+ * cannot change once they hold the log->l_icloglock. Hence we need to
+ * hold that lock here, even though we use the atomic test_and_set_bit()
+ * operation to set the shutdown state.
*/
spin_lock(&log->l_icloglock);
- mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
- if (mp->m_sb_bp)
- mp->m_sb_bp->b_flags |= XBF_DONE;
+ if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
+ spin_unlock(&log->l_icloglock);
+ return false;
+ }
+ spin_unlock(&log->l_icloglock);
/*
- * Mark the log and the iclogs with IO error flags to prevent any
- * further log IO from being issued or completed.
+ * If this log shutdown also sets the mount shutdown state, issue a
+ * shutdown warning message.
*/
- log->l_flags |= XLOG_IO_ERROR;
- retval = xlog_state_ioerror(log);
- spin_unlock(&log->l_icloglock);
+ if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+"Filesystem has been shut down due to log error (0x%x).",
+ shutdown_flags);
+ xfs_alert(log->l_mp,
+"Please unmount the filesystem and rectify the problem(s).");
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+ }
/*
* We don't want anybody waiting for log reservations after this. That
@@ -3935,12 +3813,16 @@ xfs_log_force_umount(
* avoid races.
*/
spin_lock(&log->l_cilp->xc_push_lock);
+ wake_up_all(&log->l_cilp->xc_start_wait);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
- xlog_state_do_callback(log, true);
- /* return non-zero if log IOERROR transition had already happened */
- return retval;
+ spin_lock(&log->l_icloglock);
+ xlog_state_shutdown_callbacks(log);
+ spin_unlock(&log->l_icloglock);
+
+ wake_up_var(&log->l_opstate);
+ return log_error;
}
STATIC int
@@ -3978,7 +3860,7 @@ xfs_log_check_lsn(
* resets the in-core LSN. We can't validate in this mode, but
* modifications are not allowed anyways so just return true.
*/
- if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ if (xfs_has_norecovery(mp))
return true;
/*
@@ -4004,11 +3886,22 @@ xfs_log_check_lsn(
return valid;
}
-bool
-xfs_log_in_recovery(
- struct xfs_mount *mp)
+/*
+ * Notify the log that we're about to start using a feature that is protected
+ * by a log incompat feature flag. This will prevent log covering from
+ * clearing those flags.
+ */
+void
+xlog_use_incompat_feat(
+ struct xlog *log)
{
- struct xlog *log = mp->m_log;
+ down_read(&log->l_incompat_users);
+}
- return log->l_flags & XLOG_ACTIVE_RECOVERY;
+/* Notify the log that we've finished using log incompat features. */
+void
+xlog_drop_incompat_feat(
+ struct xlog *log)
+{
+ up_read(&log->l_incompat_users);
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e06805160f..2728886c2963 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -9,7 +9,8 @@
struct xfs_cil_ctx;
struct xfs_log_vec {
- struct xfs_log_vec *lv_next; /* next lv in build list */
+ struct list_head lv_list; /* CIL lv chain ptrs */
+ uint32_t lv_order_id; /* chain ordering info */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
@@ -21,46 +22,59 @@ struct xfs_log_vec {
#define XFS_LOG_VEC_ORDERED (-1)
-static inline void *
-xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- uint type)
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * alignments.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
{
- struct xfs_log_iovec *vec = *vecp;
+ return roundup(len, sizeof(uint32_t));
+}
+
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type);
- if (vec) {
- ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
- vec++;
- } else {
- vec = &lv->lv_iovecp[0];
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+ int data_len)
+{
+ struct xlog_op_header *oph = vec->i_addr;
+ int len;
+
+ /*
+ * Always round up the length to the correct alignment so callers don't
+ * need to know anything about this log vec layout requirement. This
+ * means we have to zero the area the data to be written does not cover.
+ * This is complicated by fact the payload region is offset into the
+ * logvec region by the opheader that tracks the payload.
+ */
+ len = xlog_calc_iovec_len(data_len);
+ if (len - data_len != 0) {
+ char *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+ memset(buf + data_len, 0, len - data_len);
}
- vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ /*
+ * The opheader tracks aligned payload length, whilst the logvec tracks
+ * the overall region length.
+ */
+ oph->oh_len = cpu_to_be32(len);
- ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+ len += sizeof(struct xlog_op_header);
+ lv->lv_buf_len += len;
+ lv->lv_bytes += len;
+ vec->i_len = len;
- *vecp = vec;
- return vec->i_addr;
+ /* Catch buffer overruns */
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
}
/*
- * We need to make sure the next buffer is naturally aligned for the biggest
- * basic data type we put into it. We already accounted for this padding when
- * sizing the buffer.
- *
- * However, this padding does not get written into the log, and hence we have to
- * track the space used by the log vectors separately to prevent log space hangs
- * due to inaccurate accounting (i.e. a leak) of the used log space through the
- * CIL context ticket.
+ * Copy the amount of data requested by the caller into a new log iovec.
*/
-static inline void
-xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
-{
- lv->lv_buf_len += round_up(len, sizeof(uint64_t));
- lv->lv_bytes += len;
- vec->i_len = len;
-}
-
static inline void *
xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type, void *data, int len)
@@ -73,6 +87,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return buf;
}
+static inline void *
+xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ const struct xfs_log_iovec *src)
+{
+ return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
+}
+
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
@@ -104,13 +125,10 @@ struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
+struct xlog;
-xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
-int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
+int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags,
int *log_forced);
int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
@@ -120,30 +138,29 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
-void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_release_iclog(struct xfs_mount *mp,
- struct xlog_in_core *iclog);
-int xfs_log_reserve(struct xfs_mount *mp,
- int length,
- int count,
- struct xlog_ticket **ticket,
- uint8_t clientid,
- bool permanent);
-int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-void xfs_log_unmount(struct xfs_mount *mp);
-int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+void xfs_log_space_wake(struct xfs_mount *mp);
+int xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+ struct xlog_ticket **ticket, bool permanent);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void xfs_log_unmount(struct xfs_mount *mp);
+bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn, bool regrant);
-void xlog_cil_process_committed(struct list_head *list, bool aborted);
+void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
-void xfs_log_quiesce(struct xfs_mount *mp);
+int xfs_log_quiesce(struct xfs_mount *mp);
+void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
-bool xfs_log_in_recovery(struct xfs_mount *);
+
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
+
+void xlog_use_incompat_feat(struct xlog *log);
+void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 48435cf2aa16..eccbfb99e894 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,17 +37,160 @@ xlog_cil_ticket_alloc(
{
struct xlog_ticket *tic;
- tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
- KM_NOFS);
+ tic = xlog_ticket_alloc(log, 0, 1, 0);
/*
* set the current reservation to zero so we know to steal the basic
* transaction overhead reservation from the first transaction commit.
*/
tic->t_curr_res = 0;
+ tic->t_iclog_hdrs = 0;
return tic;
}
+static inline void
+xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
+{
+ struct xlog *log = cil->xc_log;
+
+ atomic_set(&cil->xc_iclog_hdrs,
+ (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) /
+ (log->l_iclog_size - log->l_iclog_hsize)));
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+ struct xfs_cil *cil,
+ struct xfs_log_item *lip)
+{
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+ return false;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
+/*
+ * Unavoidable forward declaration - xlog_cil_push_work() calls
+ * xlog_cil_ctx_alloc() itself.
+ */
+static void xlog_cil_push_work(struct work_struct *work);
+
+static struct xfs_cil_ctx *
+xlog_cil_ctx_alloc(void)
+{
+ struct xfs_cil_ctx *ctx;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_LIST_HEAD(&ctx->log_items);
+ INIT_LIST_HEAD(&ctx->lv_chain);
+ INIT_WORK(&ctx->push_work, xlog_cil_push_work);
+ return ctx;
+}
+
+/*
+ * Aggregate the CIL per cpu structures into global counts, lists, etc and
+ * clear the percpu state ready for the next context to use. This is called
+ * from the push code with the context lock held exclusively, hence nothing else
+ * will be accessing or modifying the per-cpu counters.
+ */
+static void
+xlog_cil_push_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->busy_extents)) {
+ list_splice_init(&cilpcp->busy_extents,
+ &ctx->busy_extents);
+ }
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+
+ /*
+ * We're in the middle of switching cil contexts. Reset the
+ * counter we use to detect when the current context is nearing
+ * full.
+ */
+ cilpcp->space_used = 0;
+ }
+}
+
+/*
+ * Aggregate the CIL per-cpu space used counters into the global atomic value.
+ * This is called when the per-cpu counter aggregation will first pass the soft
+ * limit threshold so we can switch to atomic counter aggregation for accurate
+ * detection of hard limit traversal.
+ */
+static void
+xlog_cil_insert_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+ int count = 0;
+
+ /* Trigger atomic updates then aggregate only for the first caller */
+ if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
+ return;
+
+ for_each_online_cpu(cpu) {
+ int old, prev;
+
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ do {
+ old = cilpcp->space_used;
+ prev = cmpxchg(&cilpcp->space_used, old, 0);
+ } while (old != prev);
+ count += old;
+ }
+ atomic_add(count, &ctx->space_used);
+}
+
+static void
+xlog_cil_ctx_switch(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ xlog_cil_set_iclog_hdr_count(cil);
+ set_bit(XLOG_CIL_EMPTY, &cil->xc_flags);
+ set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags);
+ ctx->sequence = ++cil->xc_current_sequence;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+}
+
/*
* After the first stage of log recovery is done, we know where the head and
* tail of the log are. We need this log initialisation done before we can
@@ -64,6 +207,7 @@ xlog_cil_init_post_recovery(
{
log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
log->l_cilp->xc_ctx->sequence = 1;
+ xlog_cil_set_iclog_hdr_count(log->l_cilp);
}
static inline int
@@ -154,13 +298,20 @@ xlog_cil_alloc_shadow_bufs(
}
/*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
+ * We 64-bit align the length of each iovec so that the start of
+ * the next one is naturally aligned. We'll need to account for
+ * that slack space here.
+ *
+ * We also add the xlog_op_header to each region when
+ * formatting, but that's not accounted to the size of the item
+ * at this point. Hence we'll need an addition number of bytes
+ * for each vector to hold an opheader.
+ *
+ * Then round nbytes up to 64-bit alignment so that the initial
+ * buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs * sizeof(uint64_t);
+ nbytes += niovecs *
+ (sizeof(uint64_t) + sizeof(struct xlog_op_header));
nbytes = round_up(nbytes, sizeof(uint64_t));
/*
@@ -176,19 +327,19 @@ xlog_cil_alloc_shadow_bufs(
*/
if (!lip->li_lv_shadow ||
buf_size > lip->li_lv_shadow->lv_size) {
-
/*
* We free and allocate here as a realloc would copy
- * unnecessary data. We don't use kmem_zalloc() for the
+ * unnecessary data. We don't use kvzalloc() for the
* same reason - we don't need to zero the data area in
* the buffer, only the log vector header and the iovec
* storage.
*/
kmem_free(lip->li_lv_shadow);
+ lv = xlog_kvmalloc(buf_size);
- lv = kmem_alloc_large(buf_size, KM_NOFS);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
+ INIT_LIST_HEAD(&lv->lv_list);
lv->lv_item = lip;
lv->lv_size = buf_size;
if (ordered)
@@ -204,7 +355,6 @@ xlog_cil_alloc_shadow_bufs(
else
lv->lv_buf_len = 0;
lv->lv_bytes = 0;
- lv->lv_next = NULL;
}
/* Ensure the lv is set up according to ->iop_size */
@@ -218,29 +368,25 @@ xlog_cil_alloc_shadow_bufs(
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
+ * log space it will consume, and if it is a new item pin it as well.
*/
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
struct xfs_log_vec *lv,
struct xfs_log_vec *old_lv,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
- *diff_iovecs += lv->lv_niovecs;
- }
/*
* If there is no old LV, this is the first time we've seen the item in
* this CIL context and so we need to pin it. If we are replacing the
* old_lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
- * shadow buffer, so update the the pointer to it appropriately.
+ * shadow buffer, so update the pointer to it appropriately.
*/
if (!old_lv) {
if (lv->lv_item->li_ops->iop_pin)
@@ -250,7 +396,6 @@ xfs_cil_prepare_item(
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
- *diff_iovecs -= old_lv->lv_niovecs;
lv->lv_item->li_lv_shadow = old_lv;
}
@@ -299,12 +444,10 @@ static void
xlog_cil_insert_format_items(
struct xlog *log,
struct xfs_trans *tp,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
struct xfs_log_item *lip;
-
/* Bail out if we didn't find a log item. */
if (list_empty(&tp->t_items)) {
ASSERT(0);
@@ -338,7 +481,6 @@ xlog_cil_insert_format_items(
if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv;
- lv->lv_next = NULL;
if (ordered)
goto insert;
@@ -347,7 +489,6 @@ xlog_cil_insert_format_items(
* set the item up as though it is a new insertion so
* that the space reservation accounting is correct.
*/
- *diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
/* Ensure the lv is set up according to ->iop_size */
@@ -372,11 +513,28 @@ xlog_cil_insert_format_items(
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len);
}
}
/*
+ * The use of lockless waitqueue_active() requires that the caller has
+ * serialised itself against the wakeup call in xlog_cil_push_work(). That
+ * can be done by either holding the push lock or the context lock.
+ */
+static inline bool
+xlog_cil_over_hard_limit(
+ struct xlog *log,
+ int32_t space_used)
+{
+ if (waitqueue_active(&log->l_cilp->xc_push_wait))
+ return true;
+ if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ return true;
+ return false;
+}
+
+/*
* Insert the log items into the CIL and calculate the difference in space
* consumed by the item. Add the space to the checkpoint ticket and calculate
* if the change requires additional log metadata. If it does, take that space
@@ -386,15 +544,17 @@ insert:
static void
xlog_cil_insert_items(
struct xlog *log,
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ uint32_t released_space)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
struct xfs_log_item *lip;
int len = 0;
- int diff_iovecs = 0;
- int iclog_space;
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
+ int space_used;
+ int order;
+ struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@@ -402,51 +562,114 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+ xlog_cil_insert_format_items(log, tp, &len);
+
+ /*
+ * Subtract the space released by intent cancelation from the space we
+ * consumed so that we remove it from the CIL space and add it back to
+ * the current transaction reservation context.
+ */
+ len -= released_space;
+
+ /*
+ * Grab the per-cpu pointer for the CIL before we start any accounting.
+ * That ensures that we are running with pre-emption disabled and so we
+ * can't be scheduled away between split sample/update operations that
+ * are done without outside locking to serialise them.
+ */
+ cilpcp = get_cpu_ptr(cil->xc_pcp);
+
+ /*
+ * We need to take the CIL checkpoint unit reservation on the first
+ * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't
+ * unnecessarily do an atomic op in the fast path here. We can clear the
+ * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that
+ * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit.
+ */
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) &&
+ test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+ ctx_res = ctx->ticket->t_unit_res;
- spin_lock(&cil->xc_cil_lock);
+ /*
+ * Check if we need to steal iclog headers. atomic_read() is not a
+ * locked atomic operation, so we can check the value before we do any
+ * real atomic ops in the fast path. If we've already taken the CIL unit
+ * reservation from this commit, we've already got one iclog header
+ * space reserved so we have to account for that otherwise we risk
+ * overrunning the reservation on this ticket.
+ *
+ * If the CIL is already at the hard limit, we might need more header
+ * space that originally reserved. So steal more header space from every
+ * commit that occurs once we are over the hard limit to ensure the CIL
+ * push won't run out of reservation space.
+ *
+ * This can steal more than we need, but that's OK.
+ *
+ * The cil->xc_ctx_lock provides the serialisation necessary for safely
+ * calling xlog_cil_over_hard_limit() in this context.
+ */
+ space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len;
+ if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
+ xlog_cil_over_hard_limit(log, space_used)) {
+ split_res = log->l_iclog_hsize +
+ sizeof(struct xlog_op_header);
+ if (ctx_res)
+ ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1);
+ else
+ ctx_res = split_res * tp->t_ticket->t_iclog_hdrs;
+ atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
+ }
+ cilpcp->space_reserved += ctx_res;
- /* account for space used by new iovec headers */
- iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
- len += iovhdr_res;
- ctx->nvecs += diff_iovecs;
+ /*
+ * Accurately account when over the soft limit, otherwise fold the
+ * percpu count into the global count if over the per-cpu threshold.
+ */
+ if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) {
+ atomic_add(len, &ctx->space_used);
+ } else if (cilpcp->space_used + len >
+ (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) {
+ space_used = atomic_add_return(cilpcp->space_used + len,
+ &ctx->space_used);
+ cilpcp->space_used = 0;
+ /*
+ * If we just transitioned over the soft limit, we need to
+ * transition to the global atomic counter.
+ */
+ if (space_used >= XLOG_CIL_SPACE_LIMIT(log))
+ xlog_cil_insert_pcp_aggregate(cil, ctx);
+ } else {
+ cilpcp->space_used += len;
+ }
/* attach the transaction to the CIL if it has any busy extents */
if (!list_empty(&tp->t_busy))
- list_splice_init(&tp->t_busy, &ctx->busy_extents);
+ list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
/*
- * Now transfer enough transaction reservation to the context ticket
- * for the checkpoint. The context ticket is special - the unit
- * reservation has to grow as well as the current reservation as we
- * steal from tickets so we can correctly determine the space used
- * during the transaction commit.
+ * Now update the order of everything modified in the transaction
+ * and insert items into the CIL if they aren't already there.
+ * We do this here so we only need to take the CIL lock once during
+ * the transaction commit.
*/
- if (ctx->ticket->t_curr_res == 0) {
- ctx_res = ctx->ticket->t_unit_res;
- ctx->ticket->t_curr_res = ctx_res;
- tp->t_ticket->t_curr_res -= ctx_res;
- }
+ order = atomic_inc_return(&ctx->order_id);
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ /* Skip items which aren't dirty in this transaction. */
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
- /* do we need space for more log record headers? */
- iclog_space = log->l_iclog_size - log->l_iclog_hsize;
- if (len > 0 && (ctx->space_used / iclog_space !=
- (ctx->space_used + len) / iclog_space)) {
- split_res = (len + iclog_space - 1) / iclog_space;
- /* need to take into account split region headers, too */
- split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
- ctx->ticket->t_unit_res += split_res;
- ctx->ticket->t_curr_res += split_res;
- tp->t_ticket->t_curr_res -= split_res;
- ASSERT(tp->t_ticket->t_curr_res >= len);
+ lip->li_order_id = order;
+ if (!list_empty(&lip->li_cil))
+ continue;
+ list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
- tp->t_ticket->t_curr_res -= len;
- ctx->space_used += len;
+ put_cpu_ptr(cilpcp);
/*
* If we've overrun the reservation, dump the tx details before we move
* the log items. Shutdown is imminent...
*/
+ tp->t_ticket->t_curr_res -= ctx_res + len;
if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
xfs_warn(log->l_mp, "Transaction log reservation overrun:");
xfs_warn(log->l_mp,
@@ -456,44 +679,20 @@ xlog_cil_insert_items(
split_res);
xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
xlog_print_trans(tp);
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
-
- /*
- * Now (re-)position everything modified at the tail of the CIL.
- * We do this here so we only need to take the CIL lock once during
- * the transaction commit.
- */
- list_for_each_entry(lip, &tp->t_items, li_trans) {
-
- /* Skip items which aren't dirty in this transaction. */
- if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
- continue;
-
- /*
- * Only move the item if it isn't already at the tail. This is
- * to prevent a transient list_empty() state when reinserting
- * an item that is already the only item in the CIL.
- */
- if (!list_is_last(&lip->li_cil, &cil->xc_cil))
- list_move_tail(&lip->li_cil, &cil->xc_cil);
- }
-
- spin_unlock(&cil->xc_cil_lock);
-
- if (tp->t_ticket->t_curr_res < 0)
- xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
static void
xlog_cil_free_logvec(
- struct xfs_log_vec *log_vector)
+ struct list_head *lv_chain)
{
struct xfs_log_vec *lv;
- for (lv = log_vector; lv; ) {
- struct xfs_log_vec *next = lv->lv_next;
+ while (!list_empty(lv_chain)) {
+ lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
+ list_del_init(&lv->lv_list);
kmem_free(lv);
- lv = next;
}
}
@@ -536,7 +735,7 @@ xlog_discard_busy_extents(
struct blk_plug plug;
int error = 0;
- ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+ ASSERT(xfs_has_discard(mp));
blk_start_plug(&plug);
list_for_each_entry(busyp, list, list) {
@@ -546,7 +745,7 @@ xlog_discard_busy_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
@@ -574,10 +773,10 @@ xlog_discard_busy_extents(
*/
static void
xlog_cil_committed(
- struct xfs_cil_ctx *ctx,
- bool abort)
+ struct xfs_cil_ctx *ctx)
{
struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+ bool abort = xlog_is_shutdown(ctx->cil->xc_log);
/*
* If the I/O failed, we're aborting the commit and already shutdown.
@@ -588,22 +787,23 @@ xlog_cil_committed(
*/
if (abort) {
spin_lock(&ctx->cil->xc_push_lock);
+ wake_up_all(&ctx->cil->xc_start_wait);
wake_up_all(&ctx->cil->xc_commit_wait);
spin_unlock(&ctx->cil->xc_push_lock);
}
- xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
ctx->start_lsn, abort);
xfs_extent_busy_sort(&ctx->busy_extents);
xfs_extent_busy_clear(mp, &ctx->busy_extents,
- (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
+ xfs_has_discard(mp) && !abort);
spin_lock(&ctx->cil->xc_push_lock);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
- xlog_cil_free_logvec(ctx->lv_chain);
+ xlog_cil_free_logvec(&ctx->lv_chain);
if (!list_empty(&ctx->busy_extents))
xlog_discard_busy_extents(mp, ctx);
@@ -613,69 +813,414 @@ xlog_cil_committed(
void
xlog_cil_process_committed(
- struct list_head *list,
- bool aborted)
+ struct list_head *list)
{
struct xfs_cil_ctx *ctx;
while ((ctx = list_first_entry_or_null(list,
struct xfs_cil_ctx, iclog_entry))) {
list_del(&ctx->iclog_entry);
- xlog_cil_committed(ctx, aborted);
+ xlog_cil_committed(ctx);
+ }
+}
+
+/*
+* Record the LSN of the iclog we were just granted space to start writing into.
+* If the context doesn't have a start_lsn recorded, then this iclog will
+* contain the start record for the checkpoint. Otherwise this write contains
+* the commit record for the checkpoint.
+*/
+void
+xlog_cil_set_ctx_write_state(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_in_core *iclog)
+{
+ struct xfs_cil *cil = ctx->cil;
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+ ASSERT(!ctx->commit_lsn);
+ if (!ctx->start_lsn) {
+ spin_lock(&cil->xc_push_lock);
+ /*
+ * The LSN we need to pass to the log items on transaction
+ * commit is the LSN reported by the first log vector write, not
+ * the commit lsn. If we use the commit record lsn then we can
+ * move the grant write head beyond the tail LSN and overwrite
+ * it.
+ */
+ ctx->start_lsn = lsn;
+ wake_up_all(&cil->xc_start_wait);
+ spin_unlock(&cil->xc_push_lock);
+
+ /*
+ * Make sure the metadata we are about to overwrite in the log
+ * has been flushed to stable storage before this iclog is
+ * issued.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ spin_unlock(&cil->xc_log->l_icloglock);
+ return;
+ }
+
+ /*
+ * Take a reference to the iclog for the context so that we still hold
+ * it when xlog_write is done and has released it. This means the
+ * context controls when the iclog is released for IO.
+ */
+ atomic_inc(&iclog->ic_refcnt);
+
+ /*
+ * xlog_state_get_iclog_space() guarantees there is enough space in the
+ * iclog for an entire commit record, so we can attach the context
+ * callbacks now. This needs to be done before we make the commit_lsn
+ * visible to waiters so that checkpoints with commit records in the
+ * same iclog order their IO completion callbacks in the same order that
+ * the commit records appear in the iclog.
+ */
+ spin_lock(&cil->xc_log->l_icloglock);
+ list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks);
+ spin_unlock(&cil->xc_log->l_icloglock);
+
+ /*
+ * Now we can record the commit LSN and wake anyone waiting for this
+ * sequence to have the ordered commit record assigned to a physical
+ * location in the log.
+ */
+ spin_lock(&cil->xc_push_lock);
+ ctx->commit_iclog = iclog;
+ ctx->commit_lsn = lsn;
+ wake_up_all(&cil->xc_commit_wait);
+ spin_unlock(&cil->xc_push_lock);
+}
+
+
+/*
+ * Ensure that the order of log writes follows checkpoint sequence order. This
+ * relies on the context LSN being zero until the log write has guaranteed the
+ * LSN that the log write will start at via xlog_state_get_iclog_space().
+ */
+enum _record_type {
+ _START_RECORD,
+ _COMMIT_RECORD,
+};
+
+static int
+xlog_cil_order_write(
+ struct xfs_cil *cil,
+ xfs_csn_t sequence,
+ enum _record_type record)
+{
+ struct xfs_cil_ctx *ctx;
+
+restart:
+ spin_lock(&cil->xc_push_lock);
+ list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (xlog_is_shutdown(cil->xc_log)) {
+ spin_unlock(&cil->xc_push_lock);
+ return -EIO;
+ }
+
+ /*
+ * Higher sequences will wait for this one so skip them.
+ * Don't wait for our own sequence, either.
+ */
+ if (ctx->sequence >= sequence)
+ continue;
+
+ /* Wait until the LSN for the record has been recorded. */
+ switch (record) {
+ case _START_RECORD:
+ if (!ctx->start_lsn) {
+ xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ break;
+ case _COMMIT_RECORD:
+ if (!ctx->commit_lsn) {
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
+ goto restart;
+ }
+ break;
+ }
+ }
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
+}
+
+/*
+ * Write out the log vector change now attached to the CIL context. This will
+ * write a start record that needs to be strictly ordered in ascending CIL
+ * sequence order so that log recovery will always use in-order start LSNs when
+ * replaying checkpoints.
+ */
+static int
+xlog_cil_write_chain(
+ struct xfs_cil_ctx *ctx,
+ uint32_t chain_len)
+{
+ struct xlog *log = ctx->cil->xc_log;
+ int error;
+
+ error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
+ if (error)
+ return error;
+ return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len);
+}
+
+/*
+ * Write out the commit record of a checkpoint transaction to close off a
+ * running log write. These commit records are strictly ordered in ascending CIL
+ * sequence order so that log recovery will always replay the checkpoints in the
+ * correct order.
+ */
+static int
+xlog_cil_write_commit_record(
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog *log = ctx->cil->xc_log;
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
+ struct xfs_log_iovec reg = {
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
+ .i_type = XLOG_REG_TYPE_COMMIT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+ int error;
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
+
+ if (xlog_is_shutdown(log))
+ return -EIO;
+
+ error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD);
+ if (error)
+ return error;
+
+ /* account for space used by record data */
+ ctx->ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len);
+ if (error)
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+}
+
+struct xlog_cil_trans_hdr {
+ struct xlog_op_header oph[2];
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction. We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in it's own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_cil_trans_hdr *hdr,
+ struct xfs_log_vec *lvhdr,
+ int num_iovecs)
+{
+ struct xlog_ticket *tic = ctx->ticket;
+ __be32 tid = cpu_to_be32(tic->t_tid);
+
+ memset(hdr, 0, sizeof(*hdr));
+
+ /* Log start record */
+ hdr->oph[0].oh_tid = tid;
+ hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+ /* log iovec region pointer */
+ hdr->lhdr[0].i_addr = &hdr->oph[0];
+ hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+ hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+ /* log opheader */
+ hdr->oph[1].oh_tid = tid;
+ hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+ /* transaction header in host byte order format */
+ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+ hdr->thdr.th_tid = tic->t_tid;
+ hdr->thdr.th_num_items = num_iovecs;
+
+ /* log iovec region pointer */
+ hdr->lhdr[1].i_addr = &hdr->oph[1];
+ hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_trans_header);
+ hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+ lvhdr->lv_niovecs = 2;
+ lvhdr->lv_iovecp = &hdr->lhdr[0];
+ lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+
+ tic->t_curr_res -= lvhdr->lv_bytes;
+}
+
+/*
+ * CIL item reordering compare function. We want to order in ascending ID order,
+ * but we want to leave items with the same ID in the order they were added to
+ * the list. This is important for operations like reflink where we log 4 order
+ * dependent intents in a single transaction when we overwrite an existing
+ * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop),
+ * CUI (inc), BUI(remap)...
+ */
+static int
+xlog_cil_order_cmp(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list);
+ struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list);
+
+ return l1->lv_order_id > l2->lv_order_id;
+}
+
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move them to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+ struct xfs_cil_ctx *ctx,
+ struct list_head *whiteouts,
+ uint32_t *num_iovecs,
+ uint32_t *num_bytes)
+{
+ while (!list_empty(&ctx->log_items)) {
+ struct xfs_log_item *item;
+ struct xfs_log_vec *lv;
+
+ item = list_first_entry(&ctx->log_items,
+ struct xfs_log_item, li_cil);
+
+ if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+ list_move(&item->li_cil, whiteouts);
+ trace_xfs_cil_whiteout_skip(item);
+ continue;
+ }
+
+ lv = item->li_lv;
+ lv->lv_order_id = item->li_order_id;
+
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *num_bytes += lv->lv_bytes;
+ *num_iovecs += lv->lv_niovecs;
+ list_add_tail(&lv->lv_list, &ctx->lv_chain);
+
+ list_del_init(&item->li_cil);
+ item->li_order_id = 0;
+ item->li_lv = NULL;
+ }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+ struct list_head *whiteouts)
+{
+ while (!list_empty(whiteouts)) {
+ struct xfs_log_item *item = list_first_entry(whiteouts,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ trace_xfs_cil_whiteout_unpin(item);
+ item->li_ops->iop_unpin(item, 1);
}
}
/*
- * Push the Committed Item List to the log. If @push_seq flag is zero, then it
- * is a background flush and so we can chose to ignore it. Otherwise, if the
- * current sequence is the same as @push_seq we need to do a flush. If
- * @push_seq is less than the current sequence, then it has already been
+ * Push the Committed Item List to the log.
+ *
+ * If the current sequence is the same as xc_push_seq we need to do a flush. If
+ * xc_push_seq is less than the current sequence, then it has already been
* flushed and we don't need to do anything - the caller will wait for it to
* complete if necessary.
*
- * @push_seq is a value rather than a flag because that allows us to do an
- * unlocked check of the sequence number for a match. Hence we can allows log
- * forces to run racily and not issue pushes for the same sequence twice. If we
- * get a race between multiple pushes for the same sequence they will block on
- * the first one and then abort, hence avoiding needless pushes.
+ * xc_push_seq is checked unlocked against the sequence number for a match.
+ * Hence we can allow log forces to run racily and not issue pushes for the
+ * same sequence twice. If we get a race between multiple pushes for the same
+ * sequence they will block on the first one and then abort, hence avoiding
+ * needless pushes.
*/
-STATIC int
-xlog_cil_push(
- struct xlog *log)
+static void
+xlog_cil_push_work(
+ struct work_struct *work)
{
- struct xfs_cil *cil = log->l_cilp;
- struct xfs_log_vec *lv;
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil_ctx *ctx =
+ container_of(work, struct xfs_cil_ctx, push_work);
+ struct xfs_cil *cil = ctx->cil;
+ struct xlog *log = cil->xc_log;
struct xfs_cil_ctx *new_ctx;
- struct xlog_in_core *commit_iclog;
- struct xlog_ticket *tic;
- int num_iovecs;
+ int num_iovecs = 0;
+ int num_bytes = 0;
int error = 0;
- struct xfs_trans_header thdr;
- struct xfs_log_iovec lhdr;
- struct xfs_log_vec lvhdr = { NULL };
- xfs_lsn_t commit_lsn;
- xfs_lsn_t push_seq;
-
- if (!cil)
- return 0;
-
- new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
+ struct xlog_cil_trans_hdr thdr;
+ struct xfs_log_vec lvhdr = {};
+ xfs_csn_t push_seq;
+ bool push_commit_stable;
+ LIST_HEAD (whiteouts);
+ struct xlog_ticket *ticket;
+
+ new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
down_write(&cil->xc_ctx_lock);
- ctx = cil->xc_ctx;
spin_lock(&cil->xc_push_lock);
push_seq = cil->xc_push_seq;
ASSERT(push_seq <= ctx->sequence);
+ push_commit_stable = cil->xc_push_commit_stable;
+ cil->xc_push_commit_stable = false;
+
+ /*
+ * As we are about to switch to a new, empty CIL context, we no longer
+ * need to throttle tasks on CIL space overruns. Wake any waiters that
+ * the hard push throttle may have caught so they can start committing
+ * to the new context. The ctx->xc_push_lock provides the serialisation
+ * necessary for safely using the lockless waitqueue_active() check in
+ * this context.
+ */
+ if (waitqueue_active(&cil->xc_push_wait))
+ wake_up_all(&cil->xc_push_wait);
+
+ xlog_cil_push_pcp_aggregate(cil, ctx);
/*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
*/
- if (list_empty(&cil->xc_cil)) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
cil->xc_push_seq = 0;
spin_unlock(&cil->xc_push_lock);
goto out_skip;
@@ -683,7 +1228,7 @@ xlog_cil_push(
/* check for a previously pushed sequence */
- if (push_seq < cil->xc_ctx->sequence) {
+ if (push_seq < ctx->sequence) {
spin_unlock(&cil->xc_push_lock);
goto out_skip;
}
@@ -715,43 +1260,10 @@ xlog_cil_push(
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
- /*
- * pull all the log vectors off the items in the CIL, and
- * remove the items from the CIL. We don't need the CIL lock
- * here because it's only needed on the transaction commit
- * side which is currently locked out by the flush lock.
- */
- lv = NULL;
- num_iovecs = 0;
- while (!list_empty(&cil->xc_cil)) {
- struct xfs_log_item *item;
-
- item = list_first_entry(&cil->xc_cil,
- struct xfs_log_item, li_cil);
- list_del_init(&item->li_cil);
- if (!ctx->lv_chain)
- ctx->lv_chain = item->li_lv;
- else
- lv->lv_next = item->li_lv;
- lv = item->li_lv;
- item->li_lv = NULL;
- num_iovecs += lv->lv_niovecs;
- }
+ xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes);
/*
- * initialise the new context and attach it to the CIL. Then attach
- * the current context to the CIL committing lsit so it can be found
- * during log forces to extract the commit lsn of the sequence that
- * needs to be forced.
- */
- INIT_LIST_HEAD(&new_ctx->committing);
- INIT_LIST_HEAD(&new_ctx->busy_extents);
- new_ctx->sequence = ctx->sequence + 1;
- new_ctx->cil = cil;
- cil->xc_ctx = new_ctx;
-
- /*
- * The switch is now done, so we can drop the context lock and move out
+ * Switch the contexts so we can drop the context lock and move out
* of a shared context. We can't just go straight to the commit record,
* though - we need to synchronise with previous and future commits so
* that the commit records are correctly ordered in the log to ensure
@@ -769,126 +1281,136 @@ xlog_cil_push(
* that higher sequences will wait for us to write out a commit record
* before they do.
*
- * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+ * xfs_log_force_seq requires us to mirror the new sequence into the cil
* structure atomically with the addition of this sequence to the
* committing list. This also ensures that we can do unlocked checks
* against the current sequence in log forces without risking
* deferencing a freed context pointer.
*/
spin_lock(&cil->xc_push_lock);
- cil->xc_current_sequence = new_ctx->sequence;
+ xlog_cil_ctx_switch(cil, new_ctx);
spin_unlock(&cil->xc_push_lock);
up_write(&cil->xc_ctx_lock);
/*
+ * Sort the log vector chain before we add the transaction headers.
+ * This ensures we always have the transaction headers at the start
+ * of the chain.
+ */
+ list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp);
+
+ /*
* Build a checkpoint transaction header and write it to the log to
* begin the transaction. We need to account for the space used by the
* transaction header here as it is not accounted for in xlog_write().
- *
- * The LSN we need to pass to the log items on transaction commit is
- * the LSN reported by the first log vector write. If we use the commit
- * record lsn then we can move the tail beyond the grant write head.
+ * Add the lvhdr to the head of the lv chain we pass to xlog_write() so
+ * it gets written into the iclog first.
*/
- tic = ctx->ticket;
- thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
- thdr.th_type = XFS_TRANS_CHECKPOINT;
- thdr.th_tid = tic->t_tid;
- thdr.th_num_items = num_iovecs;
- lhdr.i_addr = &thdr;
- lhdr.i_len = sizeof(xfs_trans_header_t);
- lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
- tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
-
- lvhdr.lv_niovecs = 1;
- lvhdr.lv_iovecp = &lhdr;
- lvhdr.lv_next = ctx->lv_chain;
-
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+ num_bytes += lvhdr.lv_bytes;
+ list_add(&lvhdr.lv_list, &ctx->lv_chain);
+
+ /*
+ * Take the lvhdr back off the lv_chain immediately after calling
+ * xlog_cil_write_chain() as it should not be passed to log IO
+ * completion.
+ */
+ error = xlog_cil_write_chain(ctx, num_bytes);
+ list_del(&lvhdr.lv_list);
+ if (error)
+ goto out_abort_free_ticket;
+
+ error = xlog_cil_write_commit_record(ctx);
if (error)
goto out_abort_free_ticket;
/*
- * now that we've written the checkpoint into the log, strictly
- * order the commit records so replay will get them in the right order.
+ * Grab the ticket from the ctx so we can ungrant it after releasing the
+ * commit_iclog. The ctx may be freed by the time we return from
+ * releasing the commit_iclog (i.e. checkpoint has been completed and
+ * callback run) so we can't reference the ctx after the call to
+ * xlog_state_release_iclog().
*/
-restart:
- spin_lock(&cil->xc_push_lock);
- list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
- /*
- * Avoid getting stuck in this loop because we were woken by the
- * shutdown, but then went back to sleep once already in the
- * shutdown state.
- */
- if (XLOG_FORCED_SHUTDOWN(log)) {
- spin_unlock(&cil->xc_push_lock);
- goto out_abort_free_ticket;
- }
+ ticket = ctx->ticket;
- /*
- * Higher sequences will wait for this one so skip them.
- * Don't wait for our own sequence, either.
- */
- if (new_ctx->sequence >= ctx->sequence)
- continue;
- if (!new_ctx->commit_lsn) {
+ /*
+ * If the checkpoint spans multiple iclogs, wait for all previous iclogs
+ * to complete before we submit the commit_iclog. We can't use state
+ * checks for this - ACTIVE can be either a past completed iclog or a
+ * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a
+ * past or future iclog awaiting IO or ordered IO completion to be run.
+ * In the latter case, if it's a future iclog and we wait on it, the we
+ * will hang because it won't get processed through to ic_force_wait
+ * wakeup until this commit_iclog is written to disk. Hence we use the
+ * iclog header lsn and compare it to the commit lsn to determine if we
+ * need to wait on iclogs or not.
+ */
+ spin_lock(&log->l_icloglock);
+ if (ctx->start_lsn != ctx->commit_lsn) {
+ xfs_lsn_t plsn;
+
+ plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn);
+ if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) {
/*
- * It is still being pushed! Wait for the push to
- * complete, then start again from the beginning.
+ * Waiting on ic_force_wait orders the completion of
+ * iclogs older than ic_prev. Hence we only need to wait
+ * on the most recent older iclog here.
*/
- xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
- goto restart;
+ xlog_wait_on_iclog(ctx->commit_iclog->ic_prev);
+ spin_lock(&log->l_icloglock);
}
- }
- spin_unlock(&cil->xc_push_lock);
-
- /* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
- if (commit_lsn == -1)
- goto out_abort;
- spin_lock(&commit_iclog->ic_callback_lock);
- if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
- spin_unlock(&commit_iclog->ic_callback_lock);
- goto out_abort;
+ /*
+ * We need to issue a pre-flush so that the ordering for this
+ * checkpoint is correctly preserved down to stable storage.
+ */
+ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
}
- ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
- commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
- list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
- spin_unlock(&commit_iclog->ic_callback_lock);
/*
- * now the checkpoint commit is complete and we've attached the
- * callbacks to the iclog we can assign the commit LSN to the context
- * and wake up anyone who is waiting for the commit to complete.
+ * The commit iclog must be written to stable storage to guarantee
+ * journal IO vs metadata writeback IO is correctly ordered on stable
+ * storage.
+ *
+ * If the push caller needs the commit to be immediately stable and the
+ * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it
+ * will be written when released, switch it's state to WANT_SYNC right
+ * now.
*/
- spin_lock(&cil->xc_push_lock);
- ctx->commit_lsn = commit_lsn;
- wake_up_all(&cil->xc_commit_wait);
- spin_unlock(&cil->xc_push_lock);
+ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
+ if (push_commit_stable &&
+ ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
- /* release the hounds! */
- return xfs_log_release_iclog(log->l_mp, commit_iclog);
+ /* Not safe to reference ctx now! */
+
+ spin_unlock(&log->l_icloglock);
+ xlog_cil_cleanup_whiteouts(&whiteouts);
+ xfs_log_ticket_ungrant(log, ticket);
+ return;
out_skip:
up_write(&cil->xc_ctx_lock);
xfs_log_ticket_put(new_ctx->ticket);
kmem_free(new_ctx);
- return 0;
+ return;
out_abort_free_ticket:
- xfs_log_ticket_put(tic);
-out_abort:
- xlog_cil_committed(ctx, true);
- return -EIO;
-}
-
-static void
-xlog_cil_push_work(
- struct work_struct *work)
-{
- struct xfs_cil *cil = container_of(work, struct xfs_cil,
- xc_push_work);
- xlog_cil_push(cil->xc_log);
+ ASSERT(xlog_is_shutdown(log));
+ xlog_cil_cleanup_whiteouts(&whiteouts);
+ if (!ctx->commit_iclog) {
+ xfs_log_ticket_ungrant(log, ctx->ticket);
+ xlog_cil_committed(ctx);
+ return;
+ }
+ spin_lock(&log->l_icloglock);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
+ /* Not safe to reference ctx now! */
+ spin_unlock(&log->l_icloglock);
+ xfs_log_ticket_ungrant(log, ticket);
}
/*
@@ -900,28 +1422,65 @@ xlog_cil_push_work(
*/
static void
xlog_cil_push_background(
- struct xlog *log)
+ struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
+ int space_used = atomic_read(&cil->xc_ctx->space_used);
/*
* The cil won't be empty because we are called while holding the
- * context lock so whatever we added to the CIL will still be there
+ * context lock so whatever we added to the CIL will still be there.
*/
- ASSERT(!list_empty(&cil->xc_cil));
+ ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
/*
- * don't do a background push if we haven't used up all the
- * space available yet.
+ * We are done if:
+ * - we haven't used up all the space available yet; or
+ * - we've already queued up a push; and
+ * - we're not over the hard limit; and
+ * - nothing has been over the hard limit.
+ *
+ * If so, we don't need to take the push lock as there's nothing to do.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (space_used < XLOG_CIL_SPACE_LIMIT(log) ||
+ (cil->xc_push_seq == cil->xc_current_sequence &&
+ space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) &&
+ !waitqueue_active(&cil->xc_push_wait))) {
+ up_read(&cil->xc_ctx_lock);
return;
+ }
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
- queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
}
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun. Enforce the hard
+ * throttle on all transaction commits once it has been activated, even
+ * if the committing transactions have resulted in the space usage
+ * dipping back down under the hard limit.
+ *
+ * The ctx->xc_push_lock provides the serialisation necessary for safely
+ * calling xlog_cil_over_hard_limit() in this context.
+ */
+ if (xlog_cil_over_hard_limit(log, space_used)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(space_used < log->l_logsize);
+ xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
+ return;
+ }
+
spin_unlock(&cil->xc_push_lock);
}
@@ -929,13 +1488,26 @@ xlog_cil_push_background(
/*
* xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
* number that is passed. When it returns, the work will be queued for
- * @push_seq, but it won't be completed. The caller is expected to do any
- * waiting for push_seq to complete if it is required.
+ * @push_seq, but it won't be completed.
+ *
+ * If the caller is performing a synchronous force, we will flush the workqueue
+ * to get previously queued work moving to minimise the wait time they will
+ * undergo waiting for all outstanding pushes to complete. The caller is
+ * expected to do the required waiting for push_seq to complete.
+ *
+ * If the caller is performing an async push, we need to ensure that the
+ * checkpoint is fully flushed out of the iclogs when we finish the push. If we
+ * don't do this, then the commit record may remain sitting in memory in an
+ * ACTIVE iclog. This then requires another full log force to push to disk,
+ * which defeats the purpose of having an async, non-blocking CIL force
+ * mechanism. Hence in this case we need to pass a flag to the push work to
+ * indicate it needs to flush the commit record itself.
*/
static void
xlog_cil_push_now(
struct xlog *log,
- xfs_lsn_t push_seq)
+ xfs_lsn_t push_seq,
+ bool async)
{
struct xfs_cil *cil = log->l_cilp;
@@ -945,20 +1517,32 @@ xlog_cil_push_now(
ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
/* start on any pending background push to minimise wait time on it */
- flush_work(&cil->xc_push_work);
+ if (!async)
+ flush_workqueue(cil->xc_push_wq);
+
+ spin_lock(&cil->xc_push_lock);
+
+ /*
+ * If this is an async flush request, we always need to set the
+ * xc_push_commit_stable flag even if something else has already queued
+ * a push. The flush caller is asking for the CIL to be on stable
+ * storage when the next push completes, so regardless of who has queued
+ * the push, the flush requires stable semantics from it.
+ */
+ cil->xc_push_commit_stable = async;
/*
* If the CIL is empty or we've already pushed the sequence then
- * there's no work we need to do.
+ * there's no more work that we need to do.
*/
- spin_lock(&cil->xc_push_lock);
- if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) ||
+ push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
cil->xc_push_seq = push_seq;
- queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+ queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
spin_unlock(&cil->xc_push_lock);
}
@@ -970,13 +1554,50 @@ xlog_cil_empty(
bool empty = false;
spin_lock(&cil->xc_push_lock);
- if (list_empty(&cil->xc_cil))
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
empty = true;
spin_unlock(&cil->xc_push_lock);
return empty;
}
/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+ struct xfs_cil *cil,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *ilip, *next;
+ uint32_t len = 0;
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+ continue;
+
+ ilip = lip->li_ops->iop_intent(lip);
+ if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+ continue;
+ set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+ trace_xfs_cil_whiteout_mark(ilip);
+ len += ilip->li_lv->lv_bytes;
+ kmem_free(ilip->li_lv);
+ ilip->li_lv = NULL;
+
+ xfs_trans_del_item(lip);
+ lip->li_ops->iop_release(lip);
+ }
+ return len;
+}
+
+/*
* Commit a transaction with the given vector to the Committed Item List.
*
* To do this, we need to format the item, pin it in memory if required and
@@ -990,16 +1611,15 @@ xlog_cil_empty(
* allowed again.
*/
void
-xfs_log_commit_cil(
- struct xfs_mount *mp,
+xlog_cil_commit(
+ struct xlog *log,
struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn,
+ xfs_csn_t *commit_seq,
bool regrant)
{
- struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
- xfs_lsn_t xc_commit_lsn;
+ uint32_t released_space = 0;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1011,13 +1631,15 @@ xfs_log_commit_cil(
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
- xlog_cil_insert_items(log, tp);
+ if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+ released_space = xlog_cil_process_intents(cil, tp);
- xc_commit_lsn = cil->xc_ctx->sequence;
- if (commit_lsn)
- *commit_lsn = xc_commit_lsn;
+ xlog_cil_insert_items(log, tp, released_space);
- xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ if (regrant && !xlog_is_shutdown(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1036,27 +1658,51 @@ xfs_log_commit_cil(
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
if (lip->li_ops->iop_committing)
- lip->li_ops->iop_committing(lip, xc_commit_lsn);
+ lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence);
}
+ if (commit_seq)
+ *commit_seq = cil->xc_ctx->sequence;
+
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
xlog_cil_push_background(log);
+}
- up_read(&cil->xc_ctx_lock);
+/*
+ * Flush the CIL to stable storage but don't wait for it to complete. This
+ * requires the CIL push to ensure the commit record for the push hits the disk,
+ * but otherwise is no different to a push done from a log force.
+ */
+void
+xlog_cil_flush(
+ struct xlog *log)
+{
+ xfs_csn_t seq = log->l_cilp->xc_current_sequence;
+
+ trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
+ xlog_cil_push_now(log, seq, true);
+
+ /*
+ * If the CIL is empty, make sure that any previous checkpoint that may
+ * still be in an active iclog is pushed to stable storage.
+ */
+ if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags))
+ xfs_log_force(log->l_mp, 0);
}
/*
* Conditionally push the CIL based on the sequence passed in.
*
- * We only need to push if we haven't already pushed the sequence
- * number given. Hence the only time we will trigger a push here is
- * if the push sequence is the same as the current context.
+ * We only need to push if we haven't already pushed the sequence number given.
+ * Hence the only time we will trigger a push here is if the push sequence is
+ * the same as the current context.
*
* We return the current commit lsn to allow the callers to determine if a
* iclog flush is necessary following this call.
*/
xfs_lsn_t
-xlog_cil_force_lsn(
+xlog_cil_force_seq(
struct xlog *log,
- xfs_lsn_t sequence)
+ xfs_csn_t sequence)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx;
@@ -1064,13 +1710,17 @@ xlog_cil_force_lsn(
ASSERT(sequence <= cil->xc_current_sequence);
+ if (!sequence)
+ sequence = cil->xc_current_sequence;
+ trace_xfs_log_force(log->l_mp, sequence, _RET_IP_);
+
/*
* check to see if we need to force out the current context.
* xlog_cil_push() handles racing pushes for the same sequence,
* so no need to deal with it here.
*/
restart:
- xlog_cil_push_now(log, sequence);
+ xlog_cil_push_now(log, sequence, false);
/*
* See if we can find a previous sequence still committing.
@@ -1085,7 +1735,7 @@ restart:
* shutdown, but then went back to sleep once already in the
* shutdown state.
*/
- if (XLOG_FORCED_SHUTDOWN(log))
+ if (xlog_is_shutdown(log))
goto out_shutdown;
if (ctx->sequence > sequence)
continue;
@@ -1094,6 +1744,7 @@ restart:
* It is still being pushed! Wait for the push to
* complete, then start again from the beginning.
*/
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
goto restart;
}
@@ -1119,7 +1770,7 @@ restart:
* we would have found the context on the committing list.
*/
if (sequence == cil->xc_current_sequence &&
- !list_empty(&cil->xc_cil)) {
+ !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
spin_unlock(&cil->xc_push_lock);
goto restart;
}
@@ -1140,33 +1791,35 @@ out_shutdown:
}
/*
- * Check if the current log item was first committed in this sequence.
- * We can't rely on just the log item being in the CIL, we have to check
- * the recorded commit sequence number.
+ * Move dead percpu state to the relevant CIL context structures.
*
- * Note: for this to be used in a non-racy manner, it has to be called with
- * CIL flushing locked out. As a result, it should only be used during the
- * transaction commit process when deciding what to format into the item.
+ * We have to lock the CIL context here to ensure that nothing is modifying
+ * the percpu state, either addition or removal. Both of these are done under
+ * the CIL context lock, so grabbing that exclusively here will ensure we can
+ * safely drain the cilpcp for the CPU that is dying.
*/
-bool
-xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
+void
+xlog_cil_pcp_dead(
+ struct xlog *log,
+ unsigned int cpu)
{
- struct xfs_cil_ctx *ctx;
-
- if (list_empty(&lip->li_cil))
- return false;
-
- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+ struct xfs_cil *cil = log->l_cilp;
+ struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ struct xfs_cil_ctx *ctx;
- /*
- * li_seq is written on the first commit of a log item to record the
- * first checkpoint it is written to. Hence if it is different to the
- * current sequence, we're in a new checkpoint.
- */
- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
- return false;
- return true;
+ down_write(&cil->xc_ctx_lock);
+ ctx = cil->xc_ctx;
+ if (ctx->ticket)
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+ if (!list_empty(&cilpcp->busy_extents))
+ list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
+ atomic_add(cilpcp->space_used, &ctx->space_used);
+ cilpcp->space_used = 0;
+ up_write(&cil->xc_ctx_lock);
}
/*
@@ -1174,52 +1827,71 @@ xfs_log_item_in_current_chkpt(
*/
int
xlog_cil_init(
- struct xlog *log)
+ struct xlog *log)
{
- struct xfs_cil *cil;
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
if (!cil)
return -ENOMEM;
+ /*
+ * Limit the CIL pipeline depth to 4 concurrent works to bound the
+ * concurrency the log spinlocks will be exposed to.
+ */
+ cil->xc_push_wq = alloc_workqueue("xfs-cil/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
+ 4, log->l_mp->m_super->s_id);
+ if (!cil->xc_push_wq)
+ goto out_destroy_cil;
- ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
- if (!ctx) {
- kmem_free(cil);
- return -ENOMEM;
+ cil->xc_log = log;
+ cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp);
+ if (!cil->xc_pcp)
+ goto out_destroy_wq;
+
+ for_each_possible_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ INIT_LIST_HEAD(&cilpcp->busy_extents);
+ INIT_LIST_HEAD(&cilpcp->log_items);
}
- INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
- INIT_LIST_HEAD(&cil->xc_cil);
INIT_LIST_HEAD(&cil->xc_committing);
- spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
+ init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
+ init_waitqueue_head(&cil->xc_start_wait);
init_waitqueue_head(&cil->xc_commit_wait);
-
- INIT_LIST_HEAD(&ctx->committing);
- INIT_LIST_HEAD(&ctx->busy_extents);
- ctx->sequence = 1;
- ctx->cil = cil;
- cil->xc_ctx = ctx;
- cil->xc_current_sequence = ctx->sequence;
-
- cil->xc_log = log;
log->l_cilp = cil;
+
+ ctx = xlog_cil_ctx_alloc();
+ xlog_cil_ctx_switch(cil, ctx);
return 0;
+
+out_destroy_wq:
+ destroy_workqueue(cil->xc_push_wq);
+out_destroy_cil:
+ kmem_free(cil);
+ return -ENOMEM;
}
void
xlog_cil_destroy(
struct xlog *log)
{
- if (log->l_cilp->xc_ctx) {
- if (log->l_cilp->xc_ctx->ticket)
- xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
- kmem_free(log->l_cilp->xc_ctx);
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (cil->xc_ctx) {
+ if (cil->xc_ctx->ticket)
+ xfs_log_ticket_put(cil->xc_ctx->ticket);
+ kmem_free(cil->xc_ctx);
}
- ASSERT(list_empty(&log->l_cilp->xc_cil));
- kmem_free(log->l_cilp);
+ ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
+ free_percpu(cil->xc_pcp);
+ destroy_workqueue(cil->xc_push_wq);
+ kmem_free(cil);
}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b192c5a9f9fd..1bd2963e8fbd 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -12,15 +12,6 @@ struct xlog_ticket;
struct xfs_mount;
/*
- * Flags for log structure
- */
-#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
-#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
-#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
- shutdown */
-#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
-
-/*
* get client id from packed copy.
*
* this hack is here because the xlog_pack code copies four bytes
@@ -47,17 +38,33 @@ enum xlog_iclog_state {
XLOG_STATE_DONE_SYNC, /* Done syncing to disk */
XLOG_STATE_CALLBACK, /* Callback functions now */
XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */
- XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */
};
+#define XLOG_STATE_STRINGS \
+ { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \
+ { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \
+ { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \
+ { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \
+ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \
+ { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }
+
+/*
+ * In core log flags
+ */
+#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */
+
+#define XLOG_ICL_STRINGS \
+ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
+ { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" }
+
+
/*
- * Flags to log ticket
+ * Log ticket flags
*/
-#define XLOG_TIC_INITED 0x1 /* has been initialized */
-#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */
#define XLOG_TIC_FLAGS \
- { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
@@ -135,37 +142,17 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-/* Ticket reservation region accounting */
-#define XLOG_TIC_LEN_MAX 15
-
-/*
- * Reservation region
- * As would be stored in xfs_log_iovec but without the i_addr which
- * we don't care about.
- */
-typedef struct xlog_res {
- uint r_len; /* region length :4 */
- uint r_type; /* region's transaction type :4 */
-} xlog_res_t;
-
typedef struct xlog_ticket {
- struct list_head t_queue; /* reserve/write queue */
- struct task_struct *t_task; /* task that owns this ticket */
- xlog_tid_t t_tid; /* transaction identifier : 4 */
- atomic_t t_ref; /* ticket reference count : 4 */
- int t_curr_res; /* current reservation in bytes : 4 */
- int t_unit_res; /* unit reservation in bytes : 4 */
- char t_ocnt; /* original count : 1 */
- char t_cnt; /* current count : 1 */
- char t_clientid; /* who does this belong to; : 1 */
- char t_flags; /* properties of reservation : 1 */
-
- /* reservation array fields */
- uint t_res_num; /* num in array : 4 */
- uint t_res_num_ophdrs; /* num op hdrs : 4 */
- uint t_res_arr_sum; /* array sum : 4 */
- uint t_res_o_flow; /* sum overflow : 4 */
- xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
+ struct list_head t_queue; /* reserve/write queue */
+ struct task_struct *t_task; /* task that owns this ticket */
+ xlog_tid_t t_tid; /* transaction identifier */
+ atomic_t t_ref; /* ticket reference count */
+ int t_curr_res; /* current reservation */
+ int t_unit_res; /* unit reservation */
+ char t_ocnt; /* original unit count */
+ char t_cnt; /* current unit count */
+ uint8_t t_flags; /* properties of reservation */
+ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
} xlog_ticket_t;
/*
@@ -203,10 +190,8 @@ typedef struct xlog_in_core {
u32 ic_size;
u32 ic_offset;
enum xlog_iclog_state ic_state;
- char *ic_datap; /* pointer to iclog data */
-
- /* Callback structures need their own cacheline */
- spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
+ unsigned int ic_flags;
+ void *ic_datap; /* pointer to iclog data */
struct list_head ic_callbacks;
/* reference counts need their own cacheline */
@@ -232,17 +217,30 @@ struct xfs_cil;
struct xfs_cil_ctx {
struct xfs_cil *cil;
- xfs_lsn_t sequence; /* chkpt sequence # */
+ xfs_csn_t sequence; /* chkpt sequence # */
xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
+ struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
- int nvecs; /* number of regions */
- int space_used; /* aggregate size of regions */
+ atomic_t space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
- struct xfs_log_vec *lv_chain; /* logvecs being pushed */
+ struct list_head log_items; /* log items in chkpt */
+ struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
+ struct work_struct push_work;
+ atomic_t order_id;
+};
+
+/*
+ * Per-cpu CIL tracking items
+ */
+struct xlog_cil_pcp {
+ int32_t space_used;
+ uint32_t space_reserved;
+ struct list_head busy_extents;
+ struct list_head log_items;
};
/*
@@ -263,20 +261,32 @@ struct xfs_cil_ctx {
*/
struct xfs_cil {
struct xlog *xc_log;
- struct list_head xc_cil;
- spinlock_t xc_cil_lock;
+ unsigned long xc_flags;
+ atomic_t xc_iclog_hdrs;
+ struct workqueue_struct *xc_push_wq;
struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
struct xfs_cil_ctx *xc_ctx;
spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
- xfs_lsn_t xc_push_seq;
+ xfs_csn_t xc_push_seq;
+ bool xc_push_commit_stable;
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
- xfs_lsn_t xc_current_sequence;
- struct work_struct xc_push_work;
+ wait_queue_head_t xc_start_wait;
+ xfs_csn_t xc_current_sequence;
+ wait_queue_head_t xc_push_wait; /* background push throttle */
+
+ void __percpu *xc_pcp; /* percpu CIL structures */
+#ifdef CONFIG_HOTPLUG_CPU
+ struct list_head xc_pcp_list;
+#endif
} ____cacheline_aligned_in_smp;
+/* xc_flags bit values */
+#define XLOG_CIL_EMPTY 1
+#define XLOG_CIL_PCP_SPACE 2
+
/*
* The amount of log space we allow the CIL to aggregate is difficult to size.
* Whatever we choose, we have to make sure we can get a reservation for the
@@ -318,13 +328,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yeild the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
*/
-#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
@@ -350,7 +400,7 @@ struct xlog {
struct xfs_buftarg *l_targ; /* buftarg of log */
struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
struct delayed_work l_work; /* background flush work */
- uint l_flags;
+ long l_opstate; /* operational state */
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
int l_iclog_hsize; /* size of iclog header */
@@ -391,18 +441,52 @@ struct xlog {
struct xfs_kobj l_kobj;
- /* The following field are used for debugging; need to hold icloglock */
-#ifdef DEBUG
- void *l_iclog_bak[XLOG_MAX_ICLOGS];
-#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
+
+ uint32_t l_iclog_roundoff;/* padding roundoff */
+
+ /* Users of log incompat features should take a read lock. */
+ struct rw_semaphore l_incompat_users;
};
-#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
- ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+/*
+ * Bits for operational state
+ */
+#define XLOG_ACTIVE_RECOVERY 0 /* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED 1 /* log was recovered */
+#define XLOG_IO_ERROR 2 /* log hit an I/O error, and being
+ shutdown */
+#define XLOG_TAIL_WARN 3 /* log tail verify warning issued */
-#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
+static inline bool
+xlog_recovery_needed(struct xlog *log)
+{
+ return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+}
+
+static inline bool
+xlog_in_recovery(struct xlog *log)
+{
+ return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
+}
+
+static inline bool
+xlog_is_shutdown(struct xlog *log)
+{
+ return test_bit(XLOG_IO_ERROR, &log->l_opstate);
+}
+
+/*
+ * Wait until the xlog_force_shutdown() has marked the log as shut down
+ * so xlog_is_shutdown() will always return true.
+ */
+static inline void
+xlog_shutdown_wait(
+ struct xlog *log)
+{
+ wait_var_event(&log->l_opstate, xlog_is_shutdown(log));
+}
/* common routines */
extern int
@@ -417,35 +501,22 @@ xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
-extern kmem_zone_t *xfs_log_ticket_zone;
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int count,
- char client,
- bool permanent,
- xfs_km_flags_t alloc_flags);
-
-
-static inline void
-xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
-{
- *ptr += bytes;
- *len -= bytes;
- *off += bytes;
-}
+extern struct kmem_cache *xfs_log_ticket_cache;
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+ int count, bool permanent);
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
- struct xlog *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
- uint flags);
+int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
+ struct list_head *lv_chain, struct xlog_ticket *tic,
+ uint32_t len);
+void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
+
+void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
+ int eventual_size);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
@@ -509,28 +580,25 @@ int xlog_cil_init(struct xlog *log);
void xlog_cil_init_post_recovery(struct xlog *log);
void xlog_cil_destroy(struct xlog *log);
bool xlog_cil_empty(struct xlog *log);
+void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+ xfs_csn_t *commit_seq, bool regrant);
+void xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx,
+ struct xlog_in_core *iclog);
+
/*
* CIL force routines
*/
-xfs_lsn_t
-xlog_cil_force_lsn(
- struct xlog *log,
- xfs_lsn_t sequence);
+void xlog_cil_flush(struct xlog *log);
+xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
static inline void
xlog_cil_force(struct xlog *log)
{
- xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+ xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
}
/*
- * Unmount record type is used as a pseudo transaction type for the ticket.
- * It's value must be outside the range of XFS_TRANS_* values.
- */
-#define XLOG_UNMOUNT_REC_TYPE (-1U)
-
-/*
* Wrapper function for waiting on a wait queue serialised against wakeups
* by a spinlock. This matches the semantics of all the wait queues used in the
* log code.
@@ -550,6 +618,8 @@ xlog_wait(
remove_wait_queue(wq, &wait);
}
+int xlog_wait_on_iclog(struct xlog_in_core *iclog);
+
/*
* The LSN is valid so long as it is behind the current LSN. If it isn't, this
* means that the next log record that includes this metadata could have a
@@ -601,4 +671,43 @@ xlog_valid_lsn(
return valid;
}
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get somethign straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc outselves here.
+ *
+ * This assumes that the caller uses memalloc_nofs_save task context here, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so lets just all pretend this is a GFP_KERNEL context
+ * operation....
+ */
+static inline void *
+xlog_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
+
+/*
+ * CIL CPU dead notifier
+ */
+void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 25cfc85dbaa7..322eb2ee6c55 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -18,21 +18,16 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
-#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_bmap_btree.h"
#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_rmap_item.h"
#include "xfs_buf_item.h"
-#include "xfs_refcount_item.h"
-#include "xfs_bmap_item.h"
+#include "xfs_ag.h"
+#include "xfs_quota.h"
+#include "xfs_reflink.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -44,29 +39,11 @@ STATIC int
xlog_clear_stale_blocks(
struct xlog *,
xfs_lsn_t);
-#if defined(DEBUG)
-STATIC void
-xlog_recover_check_summary(
- struct xlog *);
-#else
-#define xlog_recover_check_summary(log)
-#endif
STATIC int
xlog_do_recovery_pass(
struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
/*
- * This structure is used during recovery to record the buf log items which
- * have been canceled and should not be replayed.
- */
-struct xfs_buf_cancel {
- xfs_daddr_t bc_blkno;
- uint bc_len;
- int bc_refcount;
- struct list_head bc_list;
-};
-
-/*
* Sector aligned buffer routines for buffer create/read/write/access
*/
@@ -97,8 +74,6 @@ xlog_alloc_buffer(
struct xlog *log,
int nbblks)
{
- int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
-
/*
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
@@ -126,7 +101,7 @@ xlog_alloc_buffer(
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
+ return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
}
/*
@@ -147,7 +122,7 @@ xlog_do_io(
xfs_daddr_t blk_no,
unsigned int nbblks,
char *data,
- unsigned int op)
+ enum req_op op)
{
int error;
@@ -164,7 +139,7 @@ xlog_do_io(
error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
BBTOB(nbblks), data, op);
- if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ if (error && !xlog_is_shutdown(log)) {
xfs_alert(log->l_mp,
"log recovery %s I/O error at daddr 0x%llx len %d error %d",
op == REQ_OP_WRITE ? "write" : "read",
@@ -284,33 +259,6 @@ xlog_header_check_mount(
return 0;
}
-STATIC void
-xlog_recover_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- /*
- * We're not going to bother about retrying
- * this during recovery. One strike!
- */
- if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror_alert(bp, __this_address);
- xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
- }
- }
-
- /*
- * On v5 supers, a bli could be attached to update the metadata LSN.
- * Clean it up.
- */
- if (bp->b_log_item)
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
-
- bp->b_iodone = NULL;
- xfs_buf_ioend(bp);
-}
-
/*
* This routine finds (to an approximation) the first block in the physical
* log which contains the given cycle. It uses a binary search algorithm.
@@ -417,6 +365,19 @@ out:
return error;
}
+static inline int
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
+{
+ if (xfs_has_logv2(log->l_mp)) {
+ int h_size = be32_to_cpu(rh->h_size);
+
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
+ h_size > XLOG_HEADER_CYCLE_SIZE)
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ }
+ return 1;
+}
+
/*
* Potentially backup over partial log record write.
*
@@ -509,15 +470,7 @@ xlog_find_verify_log_record(
* reset last_blk. Only when last_blk points in the middle of a log
* record do we update last_blk.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- uint h_size = be32_to_cpu(head->h_size);
-
- xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- } else {
- xhdrs = 1;
- }
+ xhdrs = xlog_logrec_hblks(log, head);
if (*last_blk - i + extra_bblks !=
BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
@@ -1120,7 +1073,7 @@ xlog_verify_head(
*
* Note that xlog_find_tail() clears the blocks at the new head
* (i.e., the records with invalid CRC) if the cycle number
- * matches the the current cycle.
+ * matches the current cycle.
*/
found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
buffer, rhead_blk, rhead, wrapped);
@@ -1204,22 +1157,7 @@ xlog_check_unmount_rec(
* below. We won't want to clear the unmount record if there is one, so
* we pass the lsn of the unmount record rather than the block after it.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- int h_size = be32_to_cpu(rhead->h_size);
- int h_version = be32_to_cpu(rhead->h_version);
-
- if ((h_version & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
- } else {
- hblks = 1;
- }
- } else {
- hblks = 1;
- }
-
+ hblks = xlog_logrec_hblks(log, rhead);
after_umount_blk = xlog_wrap_logbno(log,
rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
@@ -1402,7 +1340,7 @@ xlog_find_tail(
* headers if we have a filesystem using non-persistent counters.
*/
if (clean)
- log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+ set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
/*
* Make sure that there are no blocks in front of the head
@@ -1559,7 +1497,7 @@ xlog_add_record(
recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
recp->h_cycle = cpu_to_be32(cycle);
recp->h_version = cpu_to_be32(
- xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+ xfs_has_logv2(log->l_mp) ? 2 : 1);
recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
recp->h_fmt = cpu_to_be32(XLOG_FMT);
@@ -1779,12 +1717,98 @@ xlog_clear_stale_blocks(
return 0;
}
+/*
+ * Release the recovered intent item in the AIL that matches the given intent
+ * type and intent id.
+ */
+void
+xlog_recover_release_intent(
+ struct xlog *log,
+ unsigned short intent_type,
+ uint64_t intent_id)
+{
+ struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ spin_lock(&ailp->ail_lock);
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ if (lip->li_type != intent_type)
+ continue;
+ if (!lip->li_ops->iop_match(lip, intent_id))
+ continue;
+
+ spin_unlock(&ailp->ail_lock);
+ lip->li_ops->iop_release(lip);
+ spin_lock(&ailp->ail_lock);
+ break;
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+}
+
+int
+xlog_recover_iget(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp)
+{
+ int error;
+
+ error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqattach(*ipp);
+ if (error) {
+ xfs_irele(*ipp);
+ return error;
+ }
+
+ if (VFS_I(*ipp)->i_nlink == 0)
+ xfs_iflags_set(*ipp, XFS_IRECOVERY);
+
+ return 0;
+}
+
/******************************************************************************
*
* Log recover routines
*
******************************************************************************
*/
+static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
+ &xlog_buf_item_ops,
+ &xlog_inode_item_ops,
+ &xlog_dquot_item_ops,
+ &xlog_quotaoff_item_ops,
+ &xlog_icreate_item_ops,
+ &xlog_efi_item_ops,
+ &xlog_efd_item_ops,
+ &xlog_rui_item_ops,
+ &xlog_rud_item_ops,
+ &xlog_cui_item_ops,
+ &xlog_cud_item_ops,
+ &xlog_bui_item_ops,
+ &xlog_bud_item_ops,
+ &xlog_attri_item_ops,
+ &xlog_attrd_item_ops,
+};
+
+static const struct xlog_recover_item_ops *
+xlog_find_item_ops(
+ struct xlog_recover_item *item)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
+ if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
+ return xlog_recover_item_ops[i];
+
+ return NULL;
+}
/*
* Sort the log items in the transaction.
@@ -1841,54 +1865,23 @@ xlog_recover_reorder_trans(
struct xlog_recover *trans,
int pass)
{
- xlog_recover_item_t *item, *n;
+ struct xlog_recover_item *item, *n;
int error = 0;
LIST_HEAD(sort_list);
LIST_HEAD(cancel_list);
LIST_HEAD(buffer_list);
LIST_HEAD(inode_buffer_list);
- LIST_HEAD(inode_list);
+ LIST_HEAD(item_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+ enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
- switch (ITEM_TYPE(item)) {
- case XFS_LI_ICREATE:
- list_move_tail(&item->ri_list, &buffer_list);
- break;
- case XFS_LI_BUF:
- if (buf_f->blf_flags & XFS_BLF_CANCEL) {
- trace_xfs_log_recover_item_reorder_head(log,
- trans, item, pass);
- list_move(&item->ri_list, &cancel_list);
- break;
- }
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
- list_move(&item->ri_list, &inode_buffer_list);
- break;
- }
- list_move_tail(&item->ri_list, &buffer_list);
- break;
- case XFS_LI_INODE:
- case XFS_LI_DQUOT:
- case XFS_LI_QUOTAOFF:
- case XFS_LI_EFD:
- case XFS_LI_EFI:
- case XFS_LI_RUI:
- case XFS_LI_RUD:
- case XFS_LI_CUI:
- case XFS_LI_CUD:
- case XFS_LI_BUI:
- case XFS_LI_BUD:
- trace_xfs_log_recover_item_reorder_tail(log,
- trans, item, pass);
- list_move_tail(&item->ri_list, &inode_list);
- break;
- default:
+ item->ri_ops = xlog_find_item_ops(item);
+ if (!item->ri_ops) {
xfs_warn(log->l_mp,
- "%s: unrecognized type of log operation",
- __func__);
+ "%s: unrecognized type of log operation (%d)",
+ __func__, ITEM_TYPE(item));
ASSERT(0);
/*
* return the remaining items back to the transaction
@@ -1896,16 +1889,38 @@ xlog_recover_reorder_trans(
*/
if (!list_empty(&sort_list))
list_splice_init(&sort_list, &trans->r_itemq);
- error = -EIO;
- goto out;
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ if (item->ri_ops->reorder)
+ fate = item->ri_ops->reorder(item);
+
+ switch (fate) {
+ case XLOG_REORDER_BUFFER_LIST:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
+ case XLOG_REORDER_CANCEL_LIST:
+ trace_xfs_log_recover_item_reorder_head(log,
+ trans, item, pass);
+ list_move(&item->ri_list, &cancel_list);
+ break;
+ case XLOG_REORDER_INODE_BUFFER_LIST:
+ list_move(&item->ri_list, &inode_buffer_list);
+ break;
+ case XLOG_REORDER_ITEM_LIST:
+ trace_xfs_log_recover_item_reorder_tail(log,
+ trans, item, pass);
+ list_move_tail(&item->ri_list, &item_list);
+ break;
}
}
-out:
+
ASSERT(list_empty(&sort_list));
if (!list_empty(&buffer_list))
list_splice(&buffer_list, &trans->r_itemq);
- if (!list_empty(&inode_list))
- list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&item_list))
+ list_splice_tail(&item_list, &trans->r_itemq);
if (!list_empty(&inode_buffer_list))
list_splice_tail(&inode_buffer_list, &trans->r_itemq);
if (!list_empty(&cancel_list))
@@ -1913,2152 +1928,15 @@ out:
return error;
}
-/*
- * Build up the table of buf cancel records so that we don't replay
- * cancelled data in the second pass. For buffer records that are
- * not cancel records, there is nothing to do here so we just return.
- *
- * If we get a cancel record which is already in the table, this indicates
- * that the buffer was cancelled multiple times. In order to ensure
- * that during pass 2 we keep the record in the table until we reach its
- * last occurrence in the log, we keep a reference count in the cancel
- * record in the table to tell us how many times we expect to see this
- * record during the second pass.
- */
-STATIC int
-xlog_recover_buffer_pass1(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
- struct list_head *bucket;
- struct xfs_buf_cancel *bcp;
-
- if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
- xfs_err(log->l_mp, "bad buffer log item size (%d)",
- item->ri_buf[0].i_len);
- return -EFSCORRUPTED;
- }
-
- /*
- * If this isn't a cancel buffer item, then just return.
- */
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
- trace_xfs_log_recover_buf_not_cancel(log, buf_f);
- return 0;
- }
-
- /*
- * Insert an xfs_buf_cancel record into the hash table of them.
- * If there is already an identical record, bump its reference count.
- */
- bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
- list_for_each_entry(bcp, bucket, bc_list) {
- if (bcp->bc_blkno == buf_f->blf_blkno &&
- bcp->bc_len == buf_f->blf_len) {
- bcp->bc_refcount++;
- trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
- return 0;
- }
- }
-
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
- bcp->bc_blkno = buf_f->blf_blkno;
- bcp->bc_len = buf_f->blf_len;
- bcp->bc_refcount = 1;
- list_add_tail(&bcp->bc_list, bucket);
-
- trace_xfs_log_recover_buf_cancel_add(log, buf_f);
- return 0;
-}
-
-/*
- * Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table. If it is, return the cancel
- * buffer structure to the caller.
- */
-STATIC struct xfs_buf_cancel *
-xlog_peek_buffer_cancelled(
- struct xlog *log,
- xfs_daddr_t blkno,
- uint len,
- unsigned short flags)
-{
- struct list_head *bucket;
- struct xfs_buf_cancel *bcp;
-
- if (!log->l_buf_cancel_table) {
- /* empty table means no cancelled buffers in the log */
- ASSERT(!(flags & XFS_BLF_CANCEL));
- return NULL;
- }
-
- bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
- list_for_each_entry(bcp, bucket, bc_list) {
- if (bcp->bc_blkno == blkno && bcp->bc_len == len)
- return bcp;
- }
-
- /*
- * We didn't find a corresponding entry in the table, so return 0 so
- * that the buffer is NOT cancelled.
- */
- ASSERT(!(flags & XFS_BLF_CANCEL));
- return NULL;
-}
-
-/*
- * If the buffer is being cancelled then return 1 so that it will be cancelled,
- * otherwise return 0. If the buffer is actually a buffer cancel item
- * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
- * table and remove it from the table if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its last
- * occurrence in the log so that if the same buffer is re-used again after its
- * last cancellation we actually replay the changes made at that point.
- */
-STATIC int
-xlog_check_buffer_cancelled(
+void
+xlog_buf_readahead(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
- unsigned short flags)
-{
- struct xfs_buf_cancel *bcp;
-
- bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
- if (!bcp)
- return 0;
-
- /*
- * We've go a match, so return 1 so that the recovery of this buffer
- * is cancelled. If this buffer is actually a buffer cancel log
- * item, then decrement the refcount on the one in the table and
- * remove it if this is the last reference.
- */
- if (flags & XFS_BLF_CANCEL) {
- if (--bcp->bc_refcount == 0) {
- list_del(&bcp->bc_list);
- kmem_free(bcp);
- }
- }
- return 1;
-}
-
-/*
- * Perform recovery for a buffer full of inodes. In these buffers, the only
- * data which should be recovered is that which corresponds to the
- * di_next_unlinked pointers in the on disk inode structures. The rest of the
- * data for the inodes is always logged through the inodes themselves rather
- * than the inode buffer and is recovered in xlog_recover_inode_pass2().
- *
- * The only time when buffers full of inodes are fully recovered is when the
- * buffer is full of newly allocated inodes. In this case the buffer will
- * not be marked as an inode buffer and so will be sent to
- * xlog_recover_do_reg_buffer() below during recovery.
- */
-STATIC int
-xlog_recover_do_inode_buffer(
- struct xfs_mount *mp,
- xlog_recover_item_t *item,
- struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f)
-{
- int i;
- int item_index = 0;
- int bit = 0;
- int nbits = 0;
- int reg_buf_offset = 0;
- int reg_buf_bytes = 0;
- int next_unlinked_offset;
- int inodes_per_buf;
- xfs_agino_t *logged_nextp;
- xfs_agino_t *buffer_nextp;
-
- trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
-
- /*
- * Post recovery validation only works properly on CRC enabled
- * filesystems.
- */
- if (xfs_sb_version_hascrc(&mp->m_sb))
- bp->b_ops = &xfs_inode_buf_ops;
-
- inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
- for (i = 0; i < inodes_per_buf; i++) {
- next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- while (next_unlinked_offset >=
- (reg_buf_offset + reg_buf_bytes)) {
- /*
- * The next di_next_unlinked field is beyond
- * the current logged region. Find the next
- * logged region that contains or is beyond
- * the current di_next_unlinked field.
- */
- bit += nbits;
- bit = xfs_next_bit(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
-
- /*
- * If there are no more logged regions in the
- * buffer, then we're done.
- */
- if (bit == -1)
- return 0;
-
- nbits = xfs_contig_bits(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
- ASSERT(nbits > 0);
- reg_buf_offset = bit << XFS_BLF_SHIFT;
- reg_buf_bytes = nbits << XFS_BLF_SHIFT;
- item_index++;
- }
-
- /*
- * If the current logged region starts after the current
- * di_next_unlinked field, then move on to the next
- * di_next_unlinked field.
- */
- if (next_unlinked_offset < reg_buf_offset)
- continue;
-
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
-
- /*
- * The current logged region contains a copy of the
- * current di_next_unlinked field. Extract its value
- * and copy it to the buffer copy.
- */
- logged_nextp = item->ri_buf[item_index].i_addr +
- next_unlinked_offset - reg_buf_offset;
- if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
- xfs_alert(mp,
- "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
- "Trying to replay bad (0) inode di_next_unlinked field.",
- item, bp);
- return -EFSCORRUPTED;
- }
-
- buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
- *buffer_nextp = *logged_nextp;
-
- /*
- * If necessary, recalculate the CRC in the on-disk inode. We
- * have to leave the inode in a consistent state for whoever
- * reads it next....
- */
- xfs_dinode_calc_crc(mp,
- xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
-
- }
-
- return 0;
-}
-
-/*
- * V5 filesystems know the age of the buffer on disk being recovered. We can
- * have newer objects on disk than we are replaying, and so for these cases we
- * don't want to replay the current change as that will make the buffer contents
- * temporarily invalid on disk.
- *
- * The magic number might not match the buffer type we are going to recover
- * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
- * extract the LSN of the existing object in the buffer based on it's current
- * magic number. If we don't recognise the magic number in the buffer, then
- * return a LSN of -1 so that the caller knows it was an unrecognised block and
- * so can recover the buffer.
- *
- * Note: we cannot rely solely on magic number matches to determine that the
- * buffer has a valid LSN - we also need to verify that it belongs to this
- * filesystem, so we need to extract the object's LSN and compare it to that
- * which we read from the superblock. If the UUIDs don't match, then we've got a
- * stale metadata block from an old filesystem instance that we need to recover
- * over the top of.
- */
-static xfs_lsn_t
-xlog_recover_get_buf_lsn(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- uint32_t magic32;
- uint16_t magic16;
- uint16_t magicda;
- void *blk = bp->b_addr;
- uuid_t *uuid;
- xfs_lsn_t lsn = -1;
-
- /* v4 filesystems always recover immediately */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- goto recover_immediately;
-
- magic32 = be32_to_cpu(*(__be32 *)blk);
- switch (magic32) {
- case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
- case XFS_ABTB_MAGIC:
- case XFS_ABTC_MAGIC:
- case XFS_RMAP_CRC_MAGIC:
- case XFS_REFC_CRC_MAGIC:
- case XFS_IBT_CRC_MAGIC:
- case XFS_IBT_MAGIC: {
- struct xfs_btree_block *btb = blk;
-
- lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
- uuid = &btb->bb_u.s.bb_uuid;
- break;
- }
- case XFS_BMAP_CRC_MAGIC:
- case XFS_BMAP_MAGIC: {
- struct xfs_btree_block *btb = blk;
-
- lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
- uuid = &btb->bb_u.l.bb_uuid;
- break;
- }
- case XFS_AGF_MAGIC:
- lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
- uuid = &((struct xfs_agf *)blk)->agf_uuid;
- break;
- case XFS_AGFL_MAGIC:
- lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
- uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
- break;
- case XFS_AGI_MAGIC:
- lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
- uuid = &((struct xfs_agi *)blk)->agi_uuid;
- break;
- case XFS_SYMLINK_MAGIC:
- lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
- uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
- break;
- case XFS_DIR3_BLOCK_MAGIC:
- case XFS_DIR3_DATA_MAGIC:
- case XFS_DIR3_FREE_MAGIC:
- lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
- uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
- break;
- case XFS_ATTR3_RMT_MAGIC:
- /*
- * Remote attr blocks are written synchronously, rather than
- * being logged. That means they do not contain a valid LSN
- * (i.e. transactionally ordered) in them, and hence any time we
- * see a buffer to replay over the top of a remote attribute
- * block we should simply do so.
- */
- goto recover_immediately;
- case XFS_SB_MAGIC:
- /*
- * superblock uuids are magic. We may or may not have a
- * sb_meta_uuid on disk, but it will be set in the in-core
- * superblock. We set the uuid pointer for verification
- * according to the superblock feature mask to ensure we check
- * the relevant UUID in the superblock.
- */
- lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
- if (xfs_sb_version_hasmetauuid(&mp->m_sb))
- uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
- else
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
- break;
- default:
- break;
- }
-
- if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
- goto recover_immediately;
- return lsn;
- }
-
- magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
- switch (magicda) {
- case XFS_DIR3_LEAF1_MAGIC:
- case XFS_DIR3_LEAFN_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
- uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
- break;
- default:
- break;
- }
-
- if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
- goto recover_immediately;
- return lsn;
- }
-
- /*
- * We do individual object checks on dquot and inode buffers as they
- * have their own individual LSN records. Also, we could have a stale
- * buffer here, so we have to at least recognise these buffer types.
- *
- * A notd complexity here is inode unlinked list processing - it logs
- * the inode directly in the buffer, but we don't know which inodes have
- * been modified, and there is no global buffer LSN. Hence we need to
- * recover all inode buffer types immediately. This problem will be
- * fixed by logical logging of the unlinked list modifications.
- */
- magic16 = be16_to_cpu(*(__be16 *)blk);
- switch (magic16) {
- case XFS_DQUOT_MAGIC:
- case XFS_DINODE_MAGIC:
- goto recover_immediately;
- default:
- break;
- }
-
- /* unknown buffer contents, recover immediately */
-
-recover_immediately:
- return (xfs_lsn_t)-1;
-
-}
-
-/*
- * Validate the recovered buffer is of the correct type and attach the
- * appropriate buffer operations to them for writeback. Magic numbers are in a
- * few places:
- * the first 16 bits of the buffer (inode buffer, dquot buffer),
- * the first 32 bits of the buffer (most blocks),
- * inside a struct xfs_da_blkinfo at the start of the buffer.
- */
-static void
-xlog_recover_validate_buf_type(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f,
- xfs_lsn_t current_lsn)
-{
- struct xfs_da_blkinfo *info = bp->b_addr;
- uint32_t magic32;
- uint16_t magic16;
- uint16_t magicda;
- char *warnmsg = NULL;
-
- /*
- * We can only do post recovery validation on items on CRC enabled
- * fielsystems as we need to know when the buffer was written to be able
- * to determine if we should have replayed the item. If we replay old
- * metadata over a newer buffer, then it will enter a temporarily
- * inconsistent state resulting in verification failures. Hence for now
- * just avoid the verification stage for non-crc filesystems
- */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return;
-
- magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
- magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
- magicda = be16_to_cpu(info->magic);
- switch (xfs_blft_from_flags(buf_f)) {
- case XFS_BLFT_BTREE_BUF:
- switch (magic32) {
- case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTB_MAGIC:
- bp->b_ops = &xfs_bnobt_buf_ops;
- break;
- case XFS_ABTC_CRC_MAGIC:
- case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_cntbt_buf_ops;
- break;
- case XFS_IBT_CRC_MAGIC:
- case XFS_IBT_MAGIC:
- bp->b_ops = &xfs_inobt_buf_ops;
- break;
- case XFS_FIBT_CRC_MAGIC:
- case XFS_FIBT_MAGIC:
- bp->b_ops = &xfs_finobt_buf_ops;
- break;
- case XFS_BMAP_CRC_MAGIC:
- case XFS_BMAP_MAGIC:
- bp->b_ops = &xfs_bmbt_buf_ops;
- break;
- case XFS_RMAP_CRC_MAGIC:
- bp->b_ops = &xfs_rmapbt_buf_ops;
- break;
- case XFS_REFC_CRC_MAGIC:
- bp->b_ops = &xfs_refcountbt_buf_ops;
- break;
- default:
- warnmsg = "Bad btree block magic!";
- break;
- }
- break;
- case XFS_BLFT_AGF_BUF:
- if (magic32 != XFS_AGF_MAGIC) {
- warnmsg = "Bad AGF block magic!";
- break;
- }
- bp->b_ops = &xfs_agf_buf_ops;
- break;
- case XFS_BLFT_AGFL_BUF:
- if (magic32 != XFS_AGFL_MAGIC) {
- warnmsg = "Bad AGFL block magic!";
- break;
- }
- bp->b_ops = &xfs_agfl_buf_ops;
- break;
- case XFS_BLFT_AGI_BUF:
- if (magic32 != XFS_AGI_MAGIC) {
- warnmsg = "Bad AGI block magic!";
- break;
- }
- bp->b_ops = &xfs_agi_buf_ops;
- break;
- case XFS_BLFT_UDQUOT_BUF:
- case XFS_BLFT_PDQUOT_BUF:
- case XFS_BLFT_GDQUOT_BUF:
-#ifdef CONFIG_XFS_QUOTA
- if (magic16 != XFS_DQUOT_MAGIC) {
- warnmsg = "Bad DQUOT block magic!";
- break;
- }
- bp->b_ops = &xfs_dquot_buf_ops;
-#else
- xfs_alert(mp,
- "Trying to recover dquots without QUOTA support built in!");
- ASSERT(0);
-#endif
- break;
- case XFS_BLFT_DINO_BUF:
- if (magic16 != XFS_DINODE_MAGIC) {
- warnmsg = "Bad INODE block magic!";
- break;
- }
- bp->b_ops = &xfs_inode_buf_ops;
- break;
- case XFS_BLFT_SYMLINK_BUF:
- if (magic32 != XFS_SYMLINK_MAGIC) {
- warnmsg = "Bad symlink block magic!";
- break;
- }
- bp->b_ops = &xfs_symlink_buf_ops;
- break;
- case XFS_BLFT_DIR_BLOCK_BUF:
- if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
- magic32 != XFS_DIR3_BLOCK_MAGIC) {
- warnmsg = "Bad dir block magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_block_buf_ops;
- break;
- case XFS_BLFT_DIR_DATA_BUF:
- if (magic32 != XFS_DIR2_DATA_MAGIC &&
- magic32 != XFS_DIR3_DATA_MAGIC) {
- warnmsg = "Bad dir data magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_data_buf_ops;
- break;
- case XFS_BLFT_DIR_FREE_BUF:
- if (magic32 != XFS_DIR2_FREE_MAGIC &&
- magic32 != XFS_DIR3_FREE_MAGIC) {
- warnmsg = "Bad dir3 free magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_free_buf_ops;
- break;
- case XFS_BLFT_DIR_LEAF1_BUF:
- if (magicda != XFS_DIR2_LEAF1_MAGIC &&
- magicda != XFS_DIR3_LEAF1_MAGIC) {
- warnmsg = "Bad dir leaf1 magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_leaf1_buf_ops;
- break;
- case XFS_BLFT_DIR_LEAFN_BUF:
- if (magicda != XFS_DIR2_LEAFN_MAGIC &&
- magicda != XFS_DIR3_LEAFN_MAGIC) {
- warnmsg = "Bad dir leafn magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_leafn_buf_ops;
- break;
- case XFS_BLFT_DA_NODE_BUF:
- if (magicda != XFS_DA_NODE_MAGIC &&
- magicda != XFS_DA3_NODE_MAGIC) {
- warnmsg = "Bad da node magic!";
- break;
- }
- bp->b_ops = &xfs_da3_node_buf_ops;
- break;
- case XFS_BLFT_ATTR_LEAF_BUF:
- if (magicda != XFS_ATTR_LEAF_MAGIC &&
- magicda != XFS_ATTR3_LEAF_MAGIC) {
- warnmsg = "Bad attr leaf magic!";
- break;
- }
- bp->b_ops = &xfs_attr3_leaf_buf_ops;
- break;
- case XFS_BLFT_ATTR_RMT_BUF:
- if (magic32 != XFS_ATTR3_RMT_MAGIC) {
- warnmsg = "Bad attr remote magic!";
- break;
- }
- bp->b_ops = &xfs_attr3_rmt_buf_ops;
- break;
- case XFS_BLFT_SB_BUF:
- if (magic32 != XFS_SB_MAGIC) {
- warnmsg = "Bad SB block magic!";
- break;
- }
- bp->b_ops = &xfs_sb_buf_ops;
- break;
-#ifdef CONFIG_XFS_RT
- case XFS_BLFT_RTBITMAP_BUF:
- case XFS_BLFT_RTSUMMARY_BUF:
- /* no magic numbers for verification of RT buffers */
- bp->b_ops = &xfs_rtbuf_ops;
- break;
-#endif /* CONFIG_XFS_RT */
- default:
- xfs_warn(mp, "Unknown buffer type %d!",
- xfs_blft_from_flags(buf_f));
- break;
- }
-
- /*
- * Nothing else to do in the case of a NULL current LSN as this means
- * the buffer is more recent than the change in the log and will be
- * skipped.
- */
- if (current_lsn == NULLCOMMITLSN)
- return;
-
- if (warnmsg) {
- xfs_warn(mp, warnmsg);
- ASSERT(0);
- }
-
- /*
- * We must update the metadata LSN of the buffer as it is written out to
- * ensure that older transactions never replay over this one and corrupt
- * the buffer. This can occur if log recovery is interrupted at some
- * point after the current transaction completes, at which point a
- * subsequent mount starts recovery from the beginning.
- *
- * Write verifiers update the metadata LSN from log items attached to
- * the buffer. Therefore, initialize a bli purely to carry the LSN to
- * the verifier. We'll clean it up in our ->iodone() callback.
- */
- if (bp->b_ops) {
- struct xfs_buf_log_item *bip;
-
- ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_item_init(bp, mp);
- bip = bp->b_log_item;
- bip->bli_item.li_lsn = current_lsn;
- }
-}
-
-/*
- * Perform a 'normal' buffer recovery. Each logged region of the
- * buffer should be copied over the corresponding region in the
- * given buffer. The bitmap in the buf log format structure indicates
- * where to place the logged data.
- */
-STATIC void
-xlog_recover_do_reg_buffer(
- struct xfs_mount *mp,
- xlog_recover_item_t *item,
- struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f,
- xfs_lsn_t current_lsn)
-{
- int i;
- int bit;
- int nbits;
- xfs_failaddr_t fa;
- const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
-
- trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
-
- bit = 0;
- i = 1; /* 0 is the buf format structure */
- while (1) {
- bit = xfs_next_bit(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
- if (bit == -1)
- break;
- nbits = xfs_contig_bits(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
- ASSERT(nbits > 0);
- ASSERT(item->ri_buf[i].i_addr != NULL);
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(BBTOB(bp->b_length) >=
- ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
-
- /*
- * The dirty regions logged in the buffer, even though
- * contiguous, may span multiple chunks. This is because the
- * dirty region may span a physical page boundary in a buffer
- * and hence be split into two separate vectors for writing into
- * the log. Hence we need to trim nbits back to the length of
- * the current region being copied out of the log.
- */
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
-
- /*
- * Do a sanity check if this is a dquot buffer. Just checking
- * the first dquot in the buffer should do. XXXThis is
- * probably a good thing to do for other buf types also.
- */
- fa = NULL;
- if (buf_f->blf_flags &
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- if (item->ri_buf[i].i_addr == NULL) {
- xfs_alert(mp,
- "XFS: NULL dquot in %s.", __func__);
- goto next;
- }
- if (item->ri_buf[i].i_len < size_disk_dquot) {
- xfs_alert(mp,
- "XFS: dquot too small (%d) in %s.",
- item->ri_buf[i].i_len, __func__);
- goto next;
- }
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
- -1, 0);
- if (fa) {
- xfs_alert(mp,
- "dquot corrupt at %pS trying to replay into block 0x%llx",
- fa, bp->b_bn);
- goto next;
- }
- }
-
- memcpy(xfs_buf_offset(bp,
- (uint)bit << XFS_BLF_SHIFT), /* dest */
- item->ri_buf[i].i_addr, /* source */
- nbits<<XFS_BLF_SHIFT); /* length */
- next:
- i++;
- bit += nbits;
- }
-
- /* Shouldn't be any more regions */
- ASSERT(i == item->ri_total);
-
- xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
-}
-
-/*
- * Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF log item of the same type
- * (ie. USR or GRP), then just toss this buffer away; don't recover it.
- * Else, treat it as a regular buffer and do recovery.
- *
- * Return false if the buffer was tossed and true if we recovered the buffer to
- * indicate to the caller if the buffer needs writing.
- */
-STATIC bool
-xlog_recover_do_dquot_buffer(
- struct xfs_mount *mp,
- struct xlog *log,
- struct xlog_recover_item *item,
- struct xfs_buf *bp,
- struct xfs_buf_log_format *buf_f)
-{
- uint type;
-
- trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
-
- /*
- * Filesystems are required to send in quota flags at mount time.
- */
- if (!mp->m_qflags)
- return false;
-
- type = 0;
- if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
- type |= XFS_DQ_USER;
- if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
- type |= XFS_DQ_PROJ;
- if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
- type |= XFS_DQ_GROUP;
- /*
- * This type of quotas was turned off, so ignore this buffer
- */
- if (log->l_quotaoffs_flag & type)
- return false;
-
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
- return true;
-}
-
-/*
- * This routine replays a modification made to a buffer at runtime.
- * There are actually two types of buffer, regular and inode, which
- * are handled differently. Inode buffers are handled differently
- * in that we only recover a specific set of data from them, namely
- * the inode di_next_unlinked fields. This is because all other inode
- * data is actually logged via inode records and any data we replay
- * here which overlaps that may be stale.
- *
- * When meta-data buffers are freed at run time we log a buffer item
- * with the XFS_BLF_CANCEL bit set to indicate that previous copies
- * of the buffer in the log should not be replayed at recovery time.
- * This is so that if the blocks covered by the buffer are reused for
- * file data before we crash we don't end up replaying old, freed
- * meta-data into a user's file.
- *
- * To handle the cancellation of buffer log items, we make two passes
- * over the log during recovery. During the first we build a table of
- * those buffers which have been cancelled, and during the second we
- * only replay those buffers which do not have corresponding cancel
- * records in the table. See xlog_recover_buffer_pass[1,2] above
- * for more details on the implementation of the table of cancel records.
- */
-STATIC int
-xlog_recover_buffer_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- struct xlog_recover_item *item,
- xfs_lsn_t current_lsn)
-{
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
- xfs_mount_t *mp = log->l_mp;
- xfs_buf_t *bp;
- int error;
- uint buf_flags;
- xfs_lsn_t lsn;
-
- /*
- * In this pass we only want to recover all the buffers which have
- * not been cancelled and are not cancellation buffers themselves.
- */
- if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
- buf_f->blf_len, buf_f->blf_flags)) {
- trace_xfs_log_recover_buf_cancel(log, buf_f);
- return 0;
- }
-
- trace_xfs_log_recover_buf_recover(log, buf_f);
-
- buf_flags = 0;
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
- buf_flags |= XBF_UNMAPPED;
-
- error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags, &bp, NULL);
- if (error)
- return error;
-
- /*
- * Recover the buffer only if we get an LSN from it and it's less than
- * the lsn of the transaction we are replaying.
- *
- * Note that we have to be extremely careful of readahead here.
- * Readahead does not attach verfiers to the buffers so if we don't
- * actually do any replay after readahead because of the LSN we found
- * in the buffer if more recent than that current transaction then we
- * need to attach the verifier directly. Failure to do so can lead to
- * future recovery actions (e.g. EFI and unlinked list recovery) can
- * operate on the buffers and they won't get the verifier attached. This
- * can lead to blocks on disk having the correct content but a stale
- * CRC.
- *
- * It is safe to assume these clean buffers are currently up to date.
- * If the buffer is dirtied by a later transaction being replayed, then
- * the verifier will be reset to match whatever recover turns that
- * buffer into.
- */
- lsn = xlog_recover_get_buf_lsn(mp, bp);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- trace_xfs_log_recover_buf_skip(log, buf_f);
- xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
- goto out_release;
- }
-
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
- error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
- if (error)
- goto out_release;
- } else if (buf_f->blf_flags &
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- bool dirty;
-
- dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
- if (!dirty)
- goto out_release;
- } else {
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
- }
-
- /*
- * Perform delayed write on the buffer. Asynchronous writes will be
- * slower when taking into account all the buffers to be flushed.
- *
- * Also make sure that only inode buffers with good sizes stay in
- * the buffer cache. The kernel moves inodes in buffers of 1 block
- * or inode_cluster_size bytes, whichever is bigger. The inode
- * buffers in the log can be a different size if the log was generated
- * by an older kernel using unclustered inode buffers or a newer kernel
- * running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't max(blocksize, inode_cluster_size)
- * for *our* value of inode_cluster_size, then we need to keep
- * the buffer out of the buffer cache so that the buffer won't
- * overlap with future reads of those inodes.
- */
- if (XFS_DINODE_MAGIC ==
- be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
- xfs_buf_stale(bp);
- error = xfs_bwrite(bp);
- } else {
- ASSERT(bp->b_mount == mp);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp, buffer_list);
- }
-
-out_release:
- xfs_buf_relse(bp);
- return error;
-}
-
-/*
- * Inode fork owner changes
- *
- * If we have been told that we have to reparent the inode fork, it's because an
- * extent swap operation on a CRC enabled filesystem has been done and we are
- * replaying it. We need to walk the BMBT of the appropriate fork and change the
- * owners of it.
- *
- * The complexity here is that we don't have an inode context to work with, so
- * after we've replayed the inode we need to instantiate one. This is where the
- * fun begins.
- *
- * We are in the middle of log recovery, so we can't run transactions. That
- * means we cannot use cache coherent inode instantiation via xfs_iget(), as
- * that will result in the corresponding iput() running the inode through
- * xfs_inactive(). If we've just replayed an inode core that changes the link
- * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
- * transactions (bad!).
- *
- * So, to avoid this, we instantiate an inode directly from the inode core we've
- * just recovered. We have the buffer still locked, and all we really need to
- * instantiate is the inode core and the forks being modified. We can do this
- * manually, then run the inode btree owner change, and then tear down the
- * xfs_inode without having to run any transactions at all.
- *
- * Also, because we don't have a transaction context available here but need to
- * gather all the buffers we modify for writeback so we pass the buffer_list
- * instead for the operation to use.
- */
-
-STATIC int
-xfs_recover_inode_owner_change(
- struct xfs_mount *mp,
- struct xfs_dinode *dip,
- struct xfs_inode_log_format *in_f,
- struct list_head *buffer_list)
-{
- struct xfs_inode *ip;
- int error;
-
- ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
-
- ip = xfs_inode_alloc(mp, in_f->ilf_ino);
- if (!ip)
- return -ENOMEM;
-
- /* instantiate the inode */
- xfs_inode_from_disk(ip, dip);
- ASSERT(ip->i_d.di_version >= 3);
-
- error = xfs_iformat_fork(ip, dip);
- if (error)
- goto out_free_ip;
-
- if (!xfs_inode_verify_forks(ip)) {
- error = -EFSCORRUPTED;
- goto out_free_ip;
- }
-
- if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
- ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
- error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
- ip->i_ino, buffer_list);
- if (error)
- goto out_free_ip;
- }
-
- if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
- ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
- error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
- ip->i_ino, buffer_list);
- if (error)
- goto out_free_ip;
- }
-
-out_free_ip:
- xfs_inode_free(ip);
- return error;
-}
-
-STATIC int
-xlog_recover_inode_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- struct xlog_recover_item *item,
- xfs_lsn_t current_lsn)
-{
- struct xfs_inode_log_format *in_f;
- xfs_mount_t *mp = log->l_mp;
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- int len;
- char *src;
- char *dest;
- int error;
- int attr_index;
- uint fields;
- struct xfs_log_dinode *ldip;
- uint isize;
- int need_free = 0;
-
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- in_f = item->ri_buf[0].i_addr;
- } else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
- need_free = 1;
- error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
- if (error)
- goto error;
- }
-
- /*
- * Inode buffers can be freed, look out for it,
- * and do not replay the inode.
- */
- if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
- in_f->ilf_len, 0)) {
- error = 0;
- trace_xfs_log_recover_inode_cancel(log, in_f);
- goto error;
- }
- trace_xfs_log_recover_inode_recover(log, in_f);
-
- error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
- 0, &bp, &xfs_inode_buf_ops);
- if (error)
- goto error;
- ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
- dip = xfs_buf_offset(bp, in_f->ilf_boffset);
-
- /*
- * Make sure the place we're flushing out to really looks
- * like an inode!
- */
- if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
- xfs_alert(mp,
- "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
- __func__, dip, bp, in_f->ilf_ino);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- ldip = item->ri_buf[1].i_addr;
- if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
- __func__, item, in_f->ilf_ino);
- error = -EFSCORRUPTED;
- goto out_release;
- }
-
- /*
- * If the inode has an LSN in it, recover the inode only if it's less
- * than the lsn of the transaction we are replaying. Note: we still
- * need to replay an owner change even though the inode is more recent
- * than the transaction as there is no guarantee that all the btree
- * blocks are more recent than this transaction, too.
- */
- if (dip->di_version >= 3) {
- xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
-
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_owner_change;
- }
- }
-
- /*
- * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
- * are transactional and if ordering is necessary we can determine that
- * more accurately by the LSN field in the V3 inode core. Don't trust
- * the inode versions we might be changing them here - use the
- * superblock flag to determine whether we need to look at di_flushiter
- * to skip replay when the on disk inode is newer than the log one
- */
- if (!xfs_sb_version_hascrc(&mp->m_sb) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
- }
- }
-
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
-
- if (unlikely(S_ISREG(ldip->di_mode))) {
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- } else if (unlikely(S_ISDIR(ldip->di_mode))) {
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
- (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- }
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
- __func__, item, dip, bp, in_f->ilf_ino,
- ldip->di_nextents + ldip->di_anextents,
- ldip->di_nblocks);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- isize = xfs_log_dinode_size(ldip->di_version);
- if (unlikely(item->ri_buf[1].i_len > isize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
- __func__, item->ri_buf[1].i_len, item);
- error = -EFSCORRUPTED;
- goto out_release;
- }
-
- /* recover the log dinode inode into the on disk inode */
- xfs_log_dinode_to_disk(ldip, dip);
-
- fields = in_f->ilf_fields;
- if (fields & XFS_ILOG_DEV)
- xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
-
- if (in_f->ilf_size == 2)
- goto out_owner_change;
- len = item->ri_buf[2].i_len;
- src = item->ri_buf[2].i_addr;
- ASSERT(in_f->ilf_size <= 4);
- ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
- ASSERT(!(fields & XFS_ILOG_DFORK) ||
- (len == in_f->ilf_dsize));
-
- switch (fields & XFS_ILOG_DFORK) {
- case XFS_ILOG_DDATA:
- case XFS_ILOG_DEXT:
- memcpy(XFS_DFORK_DPTR(dip), src, len);
- break;
-
- case XFS_ILOG_DBROOT:
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
- (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
- XFS_DFORK_DSIZE(dip, mp));
- break;
-
- default:
- /*
- * There are no data fork flags set.
- */
- ASSERT((fields & XFS_ILOG_DFORK) == 0);
- break;
- }
-
- /*
- * If we logged any attribute data, recover it. There may or
- * may not have been any other non-core data logged in this
- * transaction.
- */
- if (in_f->ilf_fields & XFS_ILOG_AFORK) {
- if (in_f->ilf_fields & XFS_ILOG_DFORK) {
- attr_index = 3;
- } else {
- attr_index = 2;
- }
- len = item->ri_buf[attr_index].i_len;
- src = item->ri_buf[attr_index].i_addr;
- ASSERT(len == in_f->ilf_asize);
-
- switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
- case XFS_ILOG_ADATA:
- case XFS_ILOG_AEXT:
- dest = XFS_DFORK_APTR(dip);
- ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
- memcpy(dest, src, len);
- break;
-
- case XFS_ILOG_ABROOT:
- dest = XFS_DFORK_APTR(dip);
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
- len, (xfs_bmdr_block_t*)dest,
- XFS_DFORK_ASIZE(dip, mp));
- break;
-
- default:
- xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
- ASSERT(0);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- }
-
-out_owner_change:
- /* Recover the swapext owner change unless inode has been deleted */
- if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
- (dip->di_mode != 0))
- error = xfs_recover_inode_owner_change(mp, dip, in_f,
- buffer_list);
- /* re-generate the checksum. */
- xfs_dinode_calc_crc(log->l_mp, dip);
-
- ASSERT(bp->b_mount == mp);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp, buffer_list);
-
-out_release:
- xfs_buf_relse(bp);
-error:
- if (need_free)
- kmem_free(in_f);
- return error;
-}
-
-/*
- * Recover QUOTAOFF records. We simply make a note of it in the xlog
- * structure, so that we know not to do any dquot item or dquot buffer recovery,
- * of that type.
- */
-STATIC int
-xlog_recover_quotaoff_pass1(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
- ASSERT(qoff_f);
-
- /*
- * The logitem format's flag tells us if this was user quotaoff,
- * group/project quotaoff or both.
- */
- if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
- log->l_quotaoffs_flag |= XFS_DQ_USER;
- if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
- log->l_quotaoffs_flag |= XFS_DQ_PROJ;
- if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
- log->l_quotaoffs_flag |= XFS_DQ_GROUP;
-
- return 0;
-}
-
-/*
- * Recover a dquot record
- */
-STATIC int
-xlog_recover_dquot_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- struct xlog_recover_item *item,
- xfs_lsn_t current_lsn)
-{
- xfs_mount_t *mp = log->l_mp;
- xfs_buf_t *bp;
- struct xfs_disk_dquot *ddq, *recddq;
- xfs_failaddr_t fa;
- int error;
- xfs_dq_logformat_t *dq_f;
- uint type;
-
-
- /*
- * Filesystems are required to send in quota flags at mount time.
- */
- if (mp->m_qflags == 0)
- return 0;
-
- recddq = item->ri_buf[1].i_addr;
- if (recddq == NULL) {
- xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
- return -EFSCORRUPTED;
- }
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
- item->ri_buf[1].i_len, __func__);
- return -EFSCORRUPTED;
- }
-
- /*
- * This type of quotas was turned off, so ignore this record.
- */
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
- ASSERT(type);
- if (log->l_quotaoffs_flag & type)
- return 0;
-
- /*
- * At this point we know that quota was _not_ turned off.
- * Since the mount flags are not indicating to us otherwise, this
- * must mean that quota is on, and the dquot needs to be replayed.
- * Remember that we may not have fully recovered the superblock yet,
- * so we can't do the usual trick of looking at the SB quota bits.
- *
- * The other possibility, of course, is that the quota subsystem was
- * removed since the last mount - ENOSYS.
- */
- dq_f = item->ri_buf[0].i_addr;
- ASSERT(dq_f);
- fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
- if (fa) {
- xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
- dq_f->qlf_id, fa);
- return -EFSCORRUPTED;
- }
- ASSERT(dq_f->qlf_len == 1);
-
- /*
- * At this point we are assuming that the dquots have been allocated
- * and hence the buffer has valid dquots stamped in it. It should,
- * therefore, pass verifier validation. If the dquot is bad, then the
- * we'll return an error here, so we don't need to specifically check
- * the dquot in the buffer after the verifier has run.
- */
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
- &xfs_dquot_buf_ops);
- if (error)
- return error;
-
- ASSERT(bp);
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
-
- /*
- * If the dquot has an LSN in it, recover the dquot only if it's less
- * than the lsn of the transaction we are replaying.
- */
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
- xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
-
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- goto out_release;
- }
- }
-
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
- XFS_DQUOT_CRC_OFF);
- }
-
- ASSERT(dq_f->qlf_size == 2);
- ASSERT(bp->b_mount == mp);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp, buffer_list);
-
-out_release:
- xfs_buf_relse(bp);
- return 0;
-}
-
-/*
- * This routine is called to create an in-core extent free intent
- * item from the efi format structure which was logged on disk.
- * It allocates an in-core efi, copies the extents from the format
- * structure into it, and adds the efi to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_efi_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_efi_log_item *efip;
- struct xfs_efi_log_format *efi_formatp;
-
- efi_formatp = item->ri_buf[0].i_addr;
-
- efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
- error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
- if (error) {
- xfs_efi_item_free(efip);
- return error;
- }
- atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The EFI has two references. One for the EFD and one for EFI to ensure
- * it makes it into the AIL. Insert the EFI into the AIL directly and
- * drop the EFI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
- xfs_efi_release(efip);
- return 0;
-}
-
-
-/*
- * This routine is called when an EFD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding EFI if it
- * was still in the log. To do this it searches the AIL for the EFI with an id
- * equal to that in the EFD format structure. If we find it we drop the EFD
- * reference, which removes the EFI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_efd_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- xfs_efd_log_format_t *efd_formatp;
- xfs_efi_log_item_t *efip = NULL;
- struct xfs_log_item *lip;
- uint64_t efi_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- efd_formatp = item->ri_buf[0].i_addr;
- ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
- (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
- efi_id = efd_formatp->efd_efi_id;
-
- /*
- * Search for the EFI with the id in the EFD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_EFI) {
- efip = (xfs_efi_log_item_t *)lip;
- if (efip->efi_format.efi_id == efi_id) {
- /*
- * Drop the EFD reference to the EFI. This
- * removes the EFI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_efi_release(efip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * This routine is called to create an in-core extent rmap update
- * item from the rui format structure which was logged on disk.
- * It allocates an in-core rui, copies the extents from the format
- * structure into it, and adds the rui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_rui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_rui_log_item *ruip;
- struct xfs_rui_log_format *rui_formatp;
-
- rui_formatp = item->ri_buf[0].i_addr;
-
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
- error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
- if (error) {
- xfs_rui_item_free(ruip);
- return error;
- }
- atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The RUI has two references. One for the RUD and one for RUI to ensure
- * it makes it into the AIL. Insert the RUI into the AIL directly and
- * drop the RUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
- xfs_rui_release(ruip);
- return 0;
-}
-
-
-/*
- * This routine is called when an RUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding RUI if it
- * was still in the log. To do this it searches the AIL for the RUI with an id
- * equal to that in the RUD format structure. If we find it we drop the RUD
- * reference, which removes the RUI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_rud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_rud_log_format *rud_formatp;
- struct xfs_rui_log_item *ruip = NULL;
- struct xfs_log_item *lip;
- uint64_t rui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- rud_formatp = item->ri_buf[0].i_addr;
- ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
- rui_id = rud_formatp->rud_rui_id;
-
- /*
- * Search for the RUI with the id in the RUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_RUI) {
- ruip = (struct xfs_rui_log_item *)lip;
- if (ruip->rui_format.rui_id == rui_id) {
- /*
- * Drop the RUD reference to the RUI. This
- * removes the RUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_rui_release(ruip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * Copy an CUI format buffer from the given buf, and into the destination
- * CUI format structure. The CUI/CUD items were designed not to need any
- * special alignment handling.
- */
-static int
-xfs_cui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_cui_log_format *dst_cui_fmt)
-{
- struct xfs_cui_log_format *src_cui_fmt;
- uint len;
-
- src_cui_fmt = buf->i_addr;
- len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
-
- if (buf->i_len == len) {
- memcpy(dst_cui_fmt, src_cui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
-}
-
-/*
- * This routine is called to create an in-core extent refcount update
- * item from the cui format structure which was logged on disk.
- * It allocates an in-core cui, copies the extents from the format
- * structure into it, and adds the cui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_cui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_cui_log_item *cuip;
- struct xfs_cui_log_format *cui_formatp;
-
- cui_formatp = item->ri_buf[0].i_addr;
-
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
- error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
- if (error) {
- xfs_cui_item_free(cuip);
- return error;
- }
- atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The CUI has two references. One for the CUD and one for CUI to ensure
- * it makes it into the AIL. Insert the CUI into the AIL directly and
- * drop the CUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
- xfs_cui_release(cuip);
- return 0;
-}
-
-
-/*
- * This routine is called when an CUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding CUI if it
- * was still in the log. To do this it searches the AIL for the CUI with an id
- * equal to that in the CUD format structure. If we find it we drop the CUD
- * reference, which removes the CUI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_cud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_cud_log_format *cud_formatp;
- struct xfs_cui_log_item *cuip = NULL;
- struct xfs_log_item *lip;
- uint64_t cui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
- return -EFSCORRUPTED;
- }
- cui_id = cud_formatp->cud_cui_id;
-
- /*
- * Search for the CUI with the id in the CUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_CUI) {
- cuip = (struct xfs_cui_log_item *)lip;
- if (cuip->cui_format.cui_id == cui_id) {
- /*
- * Drop the CUD reference to the CUI. This
- * removes the CUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_cui_release(cuip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * Copy an BUI format buffer from the given buf, and into the destination
- * BUI format structure. The BUI/BUD items were designed not to need any
- * special alignment handling.
- */
-static int
-xfs_bui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_bui_log_format *dst_bui_fmt)
+ const struct xfs_buf_ops *ops)
{
- struct xfs_bui_log_format *src_bui_fmt;
- uint len;
-
- src_bui_fmt = buf->i_addr;
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
-
- if (buf->i_len == len) {
- memcpy(dst_bui_fmt, src_bui_fmt, len);
- return 0;
- }
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
-}
-
-/*
- * This routine is called to create an in-core extent bmap update
- * item from the bui format structure which was logged on disk.
- * It allocates an in-core bui, copies the extents from the format
- * structure into it, and adds the bui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_bui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_bui_log_item *buip;
- struct xfs_bui_log_format *bui_formatp;
-
- bui_formatp = item->ri_buf[0].i_addr;
-
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
- return -EFSCORRUPTED;
- }
- buip = xfs_bui_init(mp);
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
- if (error) {
- xfs_bui_item_free(buip);
- return error;
- }
- atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The RUI has two references. One for the RUD and one for RUI to ensure
- * it makes it into the AIL. Insert the RUI into the AIL directly and
- * drop the RUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
- xfs_bui_release(buip);
- return 0;
-}
-
-
-/*
- * This routine is called when an BUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding BUI if it
- * was still in the log. To do this it searches the AIL for the BUI with an id
- * equal to that in the BUD format structure. If we find it we drop the BUD
- * reference, which removes the BUI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_bud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_bud_log_format *bud_formatp;
- struct xfs_bui_log_item *buip = NULL;
- struct xfs_log_item *lip;
- uint64_t bui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
- return -EFSCORRUPTED;
- }
- bui_id = bud_formatp->bud_bui_id;
-
- /*
- * Search for the BUI with the id in the BUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_BUI) {
- buip = (struct xfs_bui_log_item *)lip;
- if (buip->bui_format.bui_id == bui_id) {
- /*
- * Drop the BUD reference to the BUI. This
- * removes the BUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_bui_release(buip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * This routine is called when an inode create format structure is found in a
- * committed transaction in the log. It's purpose is to initialise the inodes
- * being allocated on disk. This requires us to get inode cluster buffers that
- * match the range to be initialised, stamped with inode templates and written
- * by delayed write so that subsequent modifications will hit the cached buffer
- * and only need writing out at the end of recovery.
- */
-STATIC int
-xlog_recover_do_icreate_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- xlog_recover_item_t *item)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_icreate_log *icl;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- unsigned int count;
- unsigned int isize;
- xfs_agblock_t length;
- int bb_per_cluster;
- int cancel_count;
- int nbufs;
- int i;
-
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
- if (icl->icl_type != XFS_LI_ICREATE) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
- return -EINVAL;
- }
-
- if (icl->icl_size != 1) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
- return -EINVAL;
- }
-
- agno = be32_to_cpu(icl->icl_ag);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
- return -EINVAL;
- }
- agbno = be32_to_cpu(icl->icl_agbno);
- if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
- return -EINVAL;
- }
- isize = be32_to_cpu(icl->icl_isize);
- if (isize != mp->m_sb.sb_inodesize) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
- return -EINVAL;
- }
- count = be32_to_cpu(icl->icl_count);
- if (!count) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
- return -EINVAL;
- }
- length = be32_to_cpu(icl->icl_length);
- if (!length || length >= mp->m_sb.sb_agblocks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
- return -EINVAL;
- }
-
- /*
- * The inode chunk is either full or sparse and we only support
- * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
- */
- if (length != igeo->ialloc_blks &&
- length != igeo->ialloc_min_blks) {
- xfs_warn(log->l_mp,
- "%s: unsupported chunk length", __FUNCTION__);
- return -EINVAL;
- }
-
- /* verify inode count is consistent with extent length */
- if ((count >> mp->m_sb.sb_inopblog) != length) {
- xfs_warn(log->l_mp,
- "%s: inconsistent inode count and chunk length",
- __FUNCTION__);
- return -EINVAL;
- }
-
- /*
- * The icreate transaction can cover multiple cluster buffers and these
- * buffers could have been freed and reused. Check the individual
- * buffers for cancellation so we don't overwrite anything written after
- * a cancellation.
- */
- bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
- nbufs = length / igeo->blocks_per_cluster;
- for (i = 0, cancel_count = 0; i < nbufs; i++) {
- xfs_daddr_t daddr;
-
- daddr = XFS_AGB_TO_DADDR(mp, agno,
- agbno + i * igeo->blocks_per_cluster);
- if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
- cancel_count++;
- }
-
- /*
- * We currently only use icreate for a single allocation at a time. This
- * means we should expect either all or none of the buffers to be
- * cancelled. Be conservative and skip replay if at least one buffer is
- * cancelled, but warn the user that something is awry if the buffers
- * are not consistent.
- *
- * XXX: This must be refined to only skip cancelled clusters once we use
- * icreate for multiple chunk allocations.
- */
- ASSERT(!cancel_count || cancel_count == nbufs);
- if (cancel_count) {
- if (cancel_count != nbufs)
- xfs_warn(mp,
- "WARNING: partial inode chunk cancellation, skipped icreate.");
- trace_xfs_log_recover_icreate_cancel(log, icl);
- return 0;
- }
-
- trace_xfs_log_recover_icreate_recover(log, icl);
- return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
- length, be32_to_cpu(icl->icl_gen));
-}
-
-STATIC void
-xlog_recover_buffer_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
- struct xfs_mount *mp = log->l_mp;
-
- if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
- buf_f->blf_len, buf_f->blf_flags)) {
- return;
- }
-
- xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
- buf_f->blf_len, NULL);
-}
-
-STATIC void
-xlog_recover_inode_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_inode_log_format ilf_buf;
- struct xfs_inode_log_format *ilfp;
- struct xfs_mount *mp = log->l_mp;
- int error;
-
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- ilfp = item->ri_buf[0].i_addr;
- } else {
- ilfp = &ilf_buf;
- memset(ilfp, 0, sizeof(*ilfp));
- error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
- if (error)
- return;
- }
-
- if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
- return;
-
- xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
- ilfp->ilf_len, &xfs_inode_buf_ra_ops);
-}
-
-STATIC void
-xlog_recover_dquot_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_disk_dquot *recddq;
- struct xfs_dq_logformat *dq_f;
- uint type;
- int len;
-
-
- if (mp->m_qflags == 0)
- return;
-
- recddq = item->ri_buf[1].i_addr;
- if (recddq == NULL)
- return;
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
- return;
-
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
- ASSERT(type);
- if (log->l_quotaoffs_flag & type)
- return;
-
- dq_f = item->ri_buf[0].i_addr;
- ASSERT(dq_f);
- ASSERT(dq_f->qlf_len == 1);
-
- len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
- if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
- return;
-
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
- &xfs_dquot_buf_ra_ops);
-}
-
-STATIC void
-xlog_recover_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- xlog_recover_buffer_ra_pass2(log, item);
- break;
- case XFS_LI_INODE:
- xlog_recover_inode_ra_pass2(log, item);
- break;
- case XFS_LI_DQUOT:
- xlog_recover_dquot_ra_pass2(log, item);
- break;
- case XFS_LI_EFI:
- case XFS_LI_EFD:
- case XFS_LI_QUOTAOFF:
- case XFS_LI_RUI:
- case XFS_LI_RUD:
- case XFS_LI_CUI:
- case XFS_LI_CUD:
- case XFS_LI_BUI:
- case XFS_LI_BUD:
- default:
- break;
- }
-}
-
-STATIC int
-xlog_recover_commit_pass1(
- struct xlog *log,
- struct xlog_recover *trans,
- struct xlog_recover_item *item)
-{
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
-
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- return xlog_recover_buffer_pass1(log, item);
- case XFS_LI_QUOTAOFF:
- return xlog_recover_quotaoff_pass1(log, item);
- case XFS_LI_INODE:
- case XFS_LI_EFI:
- case XFS_LI_EFD:
- case XFS_LI_DQUOT:
- case XFS_LI_ICREATE:
- case XFS_LI_RUI:
- case XFS_LI_RUD:
- case XFS_LI_CUI:
- case XFS_LI_CUD:
- case XFS_LI_BUI:
- case XFS_LI_BUD:
- /* nothing to do in pass 1 */
- return 0;
- default:
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
- __func__, ITEM_TYPE(item));
- ASSERT(0);
- return -EFSCORRUPTED;
- }
-}
-
-STATIC int
-xlog_recover_commit_pass2(
- struct xlog *log,
- struct xlog_recover *trans,
- struct list_head *buffer_list,
- struct xlog_recover_item *item)
-{
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
-
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- return xlog_recover_buffer_pass2(log, buffer_list, item,
- trans->r_lsn);
- case XFS_LI_INODE:
- return xlog_recover_inode_pass2(log, buffer_list, item,
- trans->r_lsn);
- case XFS_LI_EFI:
- return xlog_recover_efi_pass2(log, item, trans->r_lsn);
- case XFS_LI_EFD:
- return xlog_recover_efd_pass2(log, item);
- case XFS_LI_RUI:
- return xlog_recover_rui_pass2(log, item, trans->r_lsn);
- case XFS_LI_RUD:
- return xlog_recover_rud_pass2(log, item);
- case XFS_LI_CUI:
- return xlog_recover_cui_pass2(log, item, trans->r_lsn);
- case XFS_LI_CUD:
- return xlog_recover_cud_pass2(log, item);
- case XFS_LI_BUI:
- return xlog_recover_bui_pass2(log, item, trans->r_lsn);
- case XFS_LI_BUD:
- return xlog_recover_bud_pass2(log, item);
- case XFS_LI_DQUOT:
- return xlog_recover_dquot_pass2(log, buffer_list, item,
- trans->r_lsn);
- case XFS_LI_ICREATE:
- return xlog_recover_do_icreate_pass2(log, buffer_list, item);
- case XFS_LI_QUOTAOFF:
- /* nothing to do in pass2 */
- return 0;
- default:
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
- __func__, ITEM_TYPE(item));
- ASSERT(0);
- return -EFSCORRUPTED;
- }
+ if (!xlog_is_buffer_cancelled(log, blkno, len))
+ xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
}
STATIC int
@@ -4072,8 +1950,12 @@ xlog_recover_items_pass2(
int error = 0;
list_for_each_entry(item, item_list, ri_list) {
- error = xlog_recover_commit_pass2(log, trans,
- buffer_list, item);
+ trace_xfs_log_recover_item_recover(log, trans, item,
+ XLOG_RECOVER_PASS2);
+
+ if (item->ri_ops->commit_pass2)
+ error = item->ri_ops->commit_pass2(log, buffer_list,
+ item, trans->r_lsn);
if (error)
return error;
}
@@ -4110,12 +1992,16 @@ xlog_recover_commit_trans(
return error;
list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
+ trace_xfs_log_recover_item_recover(log, trans, item, pass);
+
switch (pass) {
case XLOG_RECOVER_PASS1:
- error = xlog_recover_commit_pass1(log, trans, item);
+ if (item->ri_ops->commit_pass1)
+ error = item->ri_ops->commit_pass1(log, item);
break;
case XLOG_RECOVER_PASS2:
- xlog_recover_ra_pass2(log, item);
+ if (item->ri_ops->ra_pass2)
+ item->ri_ops->ra_pass2(log, item);
list_move_tail(&item->ri_list, &ra_list);
items_queued++;
if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
@@ -4152,9 +2038,9 @@ STATIC void
xlog_recover_add_item(
struct list_head *head)
{
- xlog_recover_item_t *item;
+ struct xlog_recover_item *item;
- item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
+ item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -4166,7 +2052,7 @@ xlog_recover_add_to_cont_trans(
char *dp,
int len)
{
- xlog_recover_item_t *item;
+ struct xlog_recover_item *item;
char *ptr, *old_ptr;
int old_len;
@@ -4189,12 +2075,15 @@ xlog_recover_add_to_cont_trans(
}
/* take the tail entry */
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
+ ri_list);
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len + old_len, 0);
+ ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4223,7 +2112,7 @@ xlog_recover_add_to_trans(
int len)
{
struct xfs_inode_log_format *in_f; /* any will do */
- xlog_recover_item_t *item;
+ struct xlog_recover_item *item;
char *ptr;
if (!len)
@@ -4259,13 +2148,14 @@ xlog_recover_add_to_trans(
in_f = (struct xfs_inode_log_format *)ptr;
/* take the tail entry */
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
+ ri_list);
if (item->ri_total != 0 &&
item->ri_total == item->ri_cnt) {
/* tail item is in use, get a new one */
xlog_recover_add_item(&trans->r_itemq);
item = list_entry(trans->r_itemq.prev,
- xlog_recover_item_t, ri_list);
+ struct xlog_recover_item, ri_list);
}
if (item->ri_total == 0) { /* first region to be added */
@@ -4311,7 +2201,7 @@ STATIC void
xlog_recover_free_trans(
struct xlog_recover *trans)
{
- xlog_recover_item_t *item, *n;
+ struct xlog_recover_item *item, *n;
int i;
hlist_del_init(&trans->r_list);
@@ -4563,274 +2453,109 @@ xlog_recover_process_data(
return 0;
}
-/* Recover the EFI if necessary. */
-STATIC int
-xlog_recover_process_efi(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_efi_log_item *efip;
- int error;
-
- /*
- * Skip EFIs that we've already processed.
- */
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_efi_recover(mp, efip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the EFI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_efi(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_efi_log_item *efip;
-
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_efi_release(efip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Recover the RUI if necessary. */
-STATIC int
-xlog_recover_process_rui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_rui_log_item *ruip;
- int error;
-
- /*
- * Skip RUIs that we've already processed.
- */
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
- if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_rui_recover(mp, ruip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the RUI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_rui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_rui_log_item *ruip;
-
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_rui_release(ruip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Recover the CUI if necessary. */
-STATIC int
-xlog_recover_process_cui(
- struct xfs_trans *parent_tp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_cui_log_item *cuip;
- int error;
-
- /*
- * Skip CUIs that we've already processed.
- */
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
- if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_cui_recover(parent_tp, cuip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the CUI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_cui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+/* Take all the collected deferred ops and finish them in order. */
+static int
+xlog_finish_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- struct xfs_cui_log_item *cuip;
+ struct xfs_defer_capture *dfc, *next;
+ struct xfs_trans *tp;
+ int error = 0;
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ struct xfs_trans_res resv;
+ struct xfs_defer_resources dres;
- spin_unlock(&ailp->ail_lock);
- xfs_cui_release(cuip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Recover the BUI if necessary. */
-STATIC int
-xlog_recover_process_bui(
- struct xfs_trans *parent_tp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_bui_log_item *buip;
- int error;
+ /*
+ * Create a new transaction reservation from the captured
+ * information. Set logcount to 1 to force the new transaction
+ * to regrant every roll so that we can make forward progress
+ * in recovery no matter how full the log might be.
+ */
+ resv.tr_logres = dfc->dfc_logres;
+ resv.tr_logcount = 1;
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- /*
- * Skip BUIs that we've already processed.
- */
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
- if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
- return 0;
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error) {
+ xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+ }
- spin_unlock(&ailp->ail_lock);
- error = xfs_bui_recover(parent_tp, buip);
- spin_lock(&ailp->ail_lock);
+ /*
+ * Transfer to this new transaction all the dfops we captured
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_continue(dfc, tp, &dres);
+ error = xfs_trans_commit(tp);
+ xfs_defer_resources_rele(&dres);
+ if (error)
+ return error;
+ }
- return error;
+ ASSERT(list_empty(capture_list));
+ return 0;
}
-/* Release the BUI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_bui(
+/* Release all the captured defer ops and capture structures in this list. */
+static void
+xlog_abort_defer_ops(
struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct list_head *capture_list)
{
- struct xfs_bui_log_item *buip;
+ struct xfs_defer_capture *dfc;
+ struct xfs_defer_capture *next;
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_bui_release(buip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Is this log item a deferred action intent? */
-static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
-{
- switch (lip->li_type) {
- case XFS_LI_EFI:
- case XFS_LI_RUI:
- case XFS_LI_CUI:
- case XFS_LI_BUI:
- return true;
- default:
- return false;
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_capture_free(mp, dfc);
}
}
-/* Take all the collected deferred ops and finish them in order. */
-static int
-xlog_finish_defer_ops(
- struct xfs_trans *parent_tp)
-{
- struct xfs_mount *mp = parent_tp->t_mountp;
- struct xfs_trans *tp;
- int64_t freeblks;
- uint resblks;
- int error;
-
- /*
- * We're finishing the defer_ops that accumulated as a result of
- * recovering unfinished intent items during log recovery. We
- * reserve an itruncate transaction because it is the largest
- * permanent transaction type. Since we're the only user of the fs
- * right now, take 93% (15/16) of the available free blocks. Use
- * weird math to avoid a 64-bit division.
- */
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
- if (freeblks <= 0)
- return -ENOSPC;
- resblks = min_t(int64_t, UINT_MAX, freeblks);
- resblks = (resblks * 15) >> 4;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
- /* transfer all collected dfops to this transaction */
- xfs_defer_move(tp, parent_tp);
-
- return xfs_trans_commit(tp);
-}
-
/*
* When this is called, all of the log intent items which did not have
- * corresponding log done items should be in the AIL. What we do now
- * is update the data structures associated with each one.
+ * corresponding log done items should be in the AIL. What we do now is update
+ * the data structures associated with each one.
*
- * Since we process the log intent items in normal transactions, they
- * will be removed at some point after the commit. This prevents us
- * from just walking down the list processing each one. We'll use a
- * flag in the intent item to skip those that we've already processed
- * and use the AIL iteration mechanism's generation count to try to
- * speed this up at least a bit.
+ * Since we process the log intent items in normal transactions, they will be
+ * removed at some point after the commit. This prevents us from just walking
+ * down the list processing each one. We'll use a flag in the intent item to
+ * skip those that we've already processed and use the AIL iteration mechanism's
+ * generation count to try to speed this up at least a bit.
*
- * When we start, we know that the intents are the only things in the
- * AIL. As we process them, however, other items are added to the
- * AIL.
+ * When we start, we know that the intents are the only things in the AIL. As we
+ * process them, however, other items are added to the AIL. Hence we know we
+ * have started recovery on all the pending intents when we find an non-intent
+ * item in the AIL.
*/
STATIC int
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_trans *parent_tp;
+ LIST_HEAD(capture_list);
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
struct xfs_ail *ailp;
- int error;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
- /*
- * The intent recovery handlers commit transactions to complete recovery
- * for individual intents, but any new deferred operations that are
- * queued during that process are held off until the very end. The
- * purpose of this transaction is to serve as a container for deferred
- * operations. Each intent recovery handler must transfer dfops here
- * before its local transaction commits, and we'll finish the entire
- * list below.
- */
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
- if (error)
- return error;
-
ailp = log->l_ailp;
spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- while (lip != NULL) {
- /*
- * We're done when we see something other than an intent.
- * There should be no intents left in the AIL now.
- */
- if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
- for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(!xlog_item_is_intent(lip));
-#endif
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ const struct xfs_item_ops *ops;
+
+ if (!xlog_item_is_intent(lip))
break;
- }
/*
* We should never see a redo item with a LSN higher than
@@ -4841,41 +2566,43 @@ xlog_recover_process_intents(
/*
* NOTE: If your intent processing routine can create more
- * deferred ops, you /must/ attach them to the dfops in this
- * routine or else those subsequent intents will get
+ * deferred ops, you /must/ attach them to the capture list in
+ * the recover routine or else those subsequent intents will be
* replayed in the wrong order!
+ *
+ * The recovery function can free the log item, so we must not
+ * access lip after it returns.
*/
- switch (lip->li_type) {
- case XFS_LI_EFI:
- error = xlog_recover_process_efi(log->l_mp, ailp, lip);
- break;
- case XFS_LI_RUI:
- error = xlog_recover_process_rui(log->l_mp, ailp, lip);
- break;
- case XFS_LI_CUI:
- error = xlog_recover_process_cui(parent_tp, ailp, lip);
- break;
- case XFS_LI_BUI:
- error = xlog_recover_process_bui(parent_tp, ailp, lip);
+ spin_unlock(&ailp->ail_lock);
+ ops = lip->li_ops;
+ error = ops->iop_recover(lip, &capture_list);
+ spin_lock(&ailp->ail_lock);
+ if (error) {
+ trace_xlog_intent_recovery_failed(log->l_mp, error,
+ ops->iop_recover);
break;
}
- if (error)
- goto out;
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
-out:
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- if (!error)
- error = xlog_finish_defer_ops(parent_tp);
- xfs_trans_cancel(parent_tp);
+ if (error)
+ goto err;
+
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
+ if (error)
+ goto err;
+ return 0;
+err:
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
return error;
}
/*
- * A cancel occurs when the mount has failed and we're bailing out.
- * Release all pending log intent items so they don't pin the AIL.
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending log intent items that we haven't started recovery on so they don't
+ * pin the AIL.
*/
STATIC void
xlog_recover_cancel_intents(
@@ -4889,33 +2616,12 @@ xlog_recover_cancel_intents(
spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
- /*
- * We're done when we see something other than an intent.
- * There should be no intents left in the AIL now.
- */
- if (!xlog_item_is_intent(lip)) {
-#ifdef DEBUG
- for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
- ASSERT(!xlog_item_is_intent(lip));
-#endif
+ if (!xlog_item_is_intent(lip))
break;
- }
-
- switch (lip->li_type) {
- case XFS_LI_EFI:
- xlog_recover_cancel_efi(log->l_mp, ailp, lip);
- break;
- case XFS_LI_RUI:
- xlog_recover_cancel_rui(log->l_mp, ailp, lip);
- break;
- case XFS_LI_CUI:
- xlog_recover_cancel_cui(log->l_mp, ailp, lip);
- break;
- case XFS_LI_BUI:
- xlog_recover_cancel_bui(log->l_mp, ailp, lip);
- break;
- }
+ spin_unlock(&ailp->ail_lock);
+ lip->li_ops->iop_release(lip);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
@@ -4929,25 +2635,25 @@ xlog_recover_cancel_intents(
*/
STATIC void
xlog_recover_clear_agi_bucket(
- xfs_mount_t *mp,
- xfs_agnumber_t agno,
- int bucket)
+ struct xfs_perag *pag,
+ int bucket)
{
- xfs_trans_t *tp;
- xfs_agi_t *agi;
- xfs_buf_t *agibp;
- int offset;
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ int offset;
+ int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
if (error)
goto out_error;
- error = xfs_read_agi(mp, tp, agno, &agibp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
goto out_abort;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket);
@@ -4962,65 +2668,62 @@ xlog_recover_clear_agi_bucket(
out_abort:
xfs_trans_cancel(tp);
out_error:
- xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
+ xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
+ pag->pag_agno);
return;
}
-STATIC xfs_agino_t
-xlog_recover_process_one_iunlink(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- int bucket)
+static int
+xlog_recover_iunlink_bucket(
+ struct xfs_perag *pag,
+ struct xfs_agi *agi,
+ int bucket)
{
- struct xfs_buf *ibp;
- struct xfs_dinode *dip;
- struct xfs_inode *ip;
- xfs_ino_t ino;
- int error;
-
- ino = XFS_AGINO_TO_INO(mp, agno, agino);
- error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
- if (error)
- goto fail;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *prev_ip = NULL;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino, agino;
+ int error = 0;
- /*
- * Get the on disk inode to find the next inode in the bucket.
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
- if (error)
- goto fail_iput;
+ agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (agino != NULLAGINO) {
+ error = xfs_iget(mp, NULL,
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
+ 0, 0, &ip);
+ if (error)
+ break;
- xfs_iflags_clear(ip, XFS_IRECOVERY);
- ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(VFS_I(ip)->i_mode != 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ xfs_iflags_clear(ip, XFS_IRECOVERY);
+ agino = ip->i_next_unlinked;
- /* setup for the next pass */
- agino = be32_to_cpu(dip->di_next_unlinked);
- xfs_buf_relse(ibp);
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
- /*
- * Prevent any DMAPI event from being sent when the reference on
- * the inode is dropped.
- */
- ip->i_d.di_dmevmask = 0;
+ /*
+ * Ensure the inode is removed from the unlinked list
+ * before we continue so that it won't race with
+ * building the in-memory list here. This could be
+ * serialised with the agibp lock, but that just
+ * serialises via lockstepping and it's much simpler
+ * just to flush the inodegc queue and wait for it to
+ * complete.
+ */
+ xfs_inodegc_flush(mp);
+ }
- xfs_irele(ip);
- return agino;
+ prev_agino = agino;
+ prev_ip = ip;
+ }
- fail_iput:
- xfs_irele(ip);
- fail:
- /*
- * We can't read in the inode this bucket points to, or this inode
- * is messed up. Just ditch this bucket of inodes. We will lose
- * some inodes and space, but at least we won't hang.
- *
- * Call xlog_recover_clear_agi_bucket() to perform a transaction to
- * clear the inode pointer in the bucket.
- */
- xlog_recover_clear_agi_bucket(mp, agno, bucket);
- return NULLAGINO;
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
+ }
+ xfs_inodegc_flush(mp);
+ return error;
}
/*
@@ -5040,62 +2743,76 @@ xlog_recover_process_one_iunlink(
* of log space.
*
* This behaviour is bad for latency on single CPU and non-preemptible kernels,
- * and can prevent other filesytem work (such as CIL pushes) from running. This
+ * and can prevent other filesystem work (such as CIL pushes) from running. This
* can lead to deadlocks if the recovery process runs out of log reservation
* space. Hence we need to yield the CPU when there is other kernel work
* scheduled on this CPU to ensure other scheduled work can run without undue
* latency.
*/
-STATIC void
-xlog_recover_process_iunlinks(
- struct xlog *log)
+static void
+xlog_recover_iunlink_ag(
+ struct xfs_perag *pag)
{
- xfs_mount_t *mp;
- xfs_agnumber_t agno;
- xfs_agi_t *agi;
- xfs_buf_t *agibp;
- xfs_agino_t agino;
- int bucket;
- int error;
-
- mp = log->l_mp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ int bucket;
+ int error;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_read_agi(pag, NULL, &agibp);
+ if (error) {
/*
- * Find the agi for this ag.
+ * AGI is b0rked. Don't process it.
+ *
+ * We should probably mark the filesystem as corrupt after we've
+ * recovered all the ag's we can....
*/
- error = xfs_read_agi(mp, NULL, agno, &agibp);
+ return;
+ }
+
+ /*
+ * Unlock the buffer so that it can be acquired in the normal course of
+ * the transaction to truncate and free each inode. Because we are not
+ * racing with anyone else here for the AGI buffer, we don't even need
+ * to hold it locked to read the initial unlinked bucket entries out of
+ * the buffer. We keep buffer reference though, so that it stays pinned
+ * in memory while we need the buffer.
+ */
+ agi = agibp->b_addr;
+ xfs_buf_unlock(agibp);
+
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+ error = xlog_recover_iunlink_bucket(pag, agi, bucket);
if (error) {
/*
- * AGI is b0rked. Don't process it.
- *
- * We should probably mark the filesystem as corrupt
- * after we've recovered all the ag's we can....
+ * Bucket is unrecoverable, so only a repair scan can
+ * free the remaining unlinked inodes. Just empty the
+ * bucket and remaining inodes on it unreferenced and
+ * unfreeable.
*/
- continue;
- }
- /*
- * Unlock the buffer so that it can be acquired in the normal
- * course of the transaction to truncate and free each inode.
- * Because we are not racing with anyone else here for the AGI
- * buffer, we don't even need to hold it locked to read the
- * initial unlinked bucket entries out of the buffer. We keep
- * buffer reference though, so that it stays pinned in memory
- * while we need the buffer.
- */
- agi = XFS_BUF_TO_AGI(agibp);
- xfs_buf_unlock(agibp);
-
- for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
- agino = be32_to_cpu(agi->agi_unlinked[bucket]);
- while (agino != NULLAGINO) {
- agino = xlog_recover_process_one_iunlink(mp,
- agno, agino, bucket);
- cond_resched();
- }
+ xfs_inodegc_flush(pag->pag_mount);
+ xlog_recover_clear_agi_bucket(pag, bucket);
}
- xfs_buf_rele(agibp);
}
+
+ xfs_buf_rele(agibp);
+}
+
+static void
+xlog_recover_process_iunlinks(
+ struct xlog *log)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag(log->l_mp, agno, pag)
+ xlog_recover_iunlink_ag(pag);
+
+ /*
+ * Flush the pending unlinked inodes to ensure that the inactivations
+ * are fully completed on disk and the incore inodes can be reclaimed
+ * before we signal that recovery is complete.
+ */
+ xfs_inodegc_flush(log->l_mp);
}
STATIC void
@@ -5112,7 +2829,7 @@ xlog_unpack_data(
dp += BBSIZE;
}
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -5160,7 +2877,7 @@ xlog_recover_process(
* the kernel from one that does not add CRCs by default.
*/
if (crc != old_crc) {
- if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ if (old_crc || xfs_has_crc(log->l_mp)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
le32_to_cpu(old_crc),
@@ -5172,7 +2889,7 @@ xlog_recover_process(
* If the filesystem is CRC enabled, this mismatch becomes a
* fatal log corruption failure.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ if (xfs_has_crc(log->l_mp)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
@@ -5188,7 +2905,8 @@ STATIC int
xlog_valid_rec_header(
struct xlog *log,
struct xlog_rec_header *rhead,
- xfs_daddr_t blkno)
+ xfs_daddr_t blkno,
+ int bufsize)
{
int hlen;
@@ -5204,10 +2922,14 @@ xlog_valid_rec_header(
return -EFSCORRUPTED;
}
- /* LR body must have data or it wouldn't have been written */
+ /*
+ * LR body must have data (or it wouldn't have been written)
+ * and h_len must not be greater than LR buffer size.
+ */
hlen = be32_to_cpu(rhead->h_len);
- if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX))
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
return -EFSCORRUPTED;
+
if (XFS_IS_CORRUPT(log->l_mp,
blkno > log->l_logBBsize || blkno > INT_MAX))
return -EFSCORRUPTED;
@@ -5253,7 +2975,7 @@ xlog_do_recovery_pass(
* Read the header of the tail block and get the iclog buffer size from
* h_size. Use this to tell how many sectors make up the log header.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ if (xfs_has_logv2(log->l_mp)) {
/*
* When using variable length iclogs, read first sector of
* iclog header and extract the header size from it. Get a
@@ -5268,9 +2990,6 @@ xlog_do_recovery_pass(
goto bread_err1;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, tail_blk);
- if (error)
- goto bread_err1;
/*
* xfsprogs has a bug where record length is based on lsunit but
@@ -5285,30 +3004,22 @@ xlog_do_recovery_pass(
*/
h_size = be32_to_cpu(rhead->h_size);
h_len = be32_to_cpu(rhead->h_len);
- if (h_len > h_size) {
- if (h_len <= log->l_mp->m_logbsize &&
- be32_to_cpu(rhead->h_num_logops) == 1) {
- xfs_warn(log->l_mp,
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
+ rhead->h_num_logops == cpu_to_be32(1)) {
+ xfs_warn(log->l_mp,
"invalid iclog size (%d bytes), using lsunit (%d bytes)",
- h_size, log->l_mp->m_logbsize);
- h_size = log->l_mp->m_logbsize;
- } else {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
- log->l_mp);
- error = -EFSCORRUPTED;
- goto bread_err1;
- }
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
}
- if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
+ if (error)
+ goto bread_err1;
+
+ hblks = xlog_logrec_hblks(log, rhead);
+ if (hblks != 1) {
kmem_free(hbp);
hbp = xlog_alloc_buffer(log, hblks);
- } else {
- hblks = 1;
}
} else {
ASSERT(log->l_sectBBsize == 1);
@@ -5380,7 +3091,7 @@ xlog_do_recovery_pass(
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
- split_hblks ? blk_no : 0);
+ split_hblks ? blk_no : 0, h_size);
if (error)
goto bread_err2;
@@ -5461,7 +3172,7 @@ xlog_do_recovery_pass(
goto bread_err2;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, blk_no);
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
if (error)
goto bread_err2;
@@ -5531,7 +3242,7 @@ xlog_do_log_recovery(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
- int error, i;
+ int error;
ASSERT(head_blk != tail_blk);
@@ -5539,37 +3250,25 @@ xlog_do_log_recovery(
* First do a pass to find all of the cancelled buf log items.
* Store them in the buf_cancel_table for use in the second pass.
*/
- log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
- sizeof(struct list_head),
- 0);
- for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+ error = xlog_alloc_buf_cancel_table(log);
+ if (error)
+ return error;
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS1, NULL);
- if (error != 0) {
- kmem_free(log->l_buf_cancel_table);
- log->l_buf_cancel_table = NULL;
- return error;
- }
+ if (error != 0)
+ goto out_cancel;
+
/*
* Then do a second pass to actually recover the items in the log.
* When it is complete free the table of buf cancel items.
*/
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS2, NULL);
-#ifdef DEBUG
- if (!error) {
- int i;
-
- for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- ASSERT(list_empty(&log->l_buf_cancel_table[i]));
- }
-#endif /* DEBUG */
-
- kmem_free(log->l_buf_cancel_table);
- log->l_buf_cancel_table = NULL;
-
+ if (!error)
+ xlog_check_buf_cancel_table(log);
+out_cancel:
+ xlog_free_buf_cancel_table(log);
return error;
}
@@ -5578,14 +3277,14 @@ xlog_do_log_recovery(
*/
STATIC int
xlog_do_recover(
- struct xlog *log,
- xfs_daddr_t head_blk,
- xfs_daddr_t tail_blk)
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
- xfs_buf_t *bp;
- xfs_sb_t *sbp;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp = mp->m_sb_bp;
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
trace_xfs_log_recover(log, head_blk, tail_blk);
@@ -5596,12 +3295,8 @@ xlog_do_recover(
if (error)
return error;
- /*
- * If IO errors happened during recovery, bail out.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xlog_is_shutdown(log))
return -EIO;
- }
/*
* We now update the tail_lsn since much of the recovery has completed
@@ -5615,18 +3310,14 @@ xlog_do_recover(
xlog_assign_tail_lsn(mp);
/*
- * Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock and reverify it.
+ * Now that we've finished replaying all buffer and inode updates,
+ * re-read the superblock and reverify it.
*/
- bp = xfs_getsb(mp);
- bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
- ASSERT(!(bp->b_flags & XBF_WRITE));
- bp->b_flags |= XBF_READ;
- bp->b_ops = &xfs_sb_buf_ops;
-
- error = xfs_buf_submit(bp);
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ error = _xfs_buf_read(bp, XBF_READ);
if (error) {
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (!xlog_is_shutdown(log)) {
xfs_buf_ioerror_alert(bp, __this_address);
ASSERT(0);
}
@@ -5635,23 +3326,22 @@ xlog_do_recover(
}
/* Convert superblock from on-disk format */
- sbp = &mp->m_sb;
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, bp->b_addr);
xfs_buf_relse(bp);
/* re-initialise in-core superblock and geometry structures */
+ mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
- error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
+ &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
return error;
}
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
- xlog_recover_check_summary(log);
-
/* Normal transactions can now occur */
- log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
return 0;
}
@@ -5677,7 +3367,7 @@ xlog_recover(
* could not be verified. Check the superblock LSN against the current
* LSN now that it's known.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ if (xfs_has_crc(log->l_mp) &&
!xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
return -EINVAL;
@@ -5704,7 +3394,7 @@ xlog_recover(
* (e.g. unsupported transactions, then simply reject the
* attempt at recovery before touching anything.
*/
- if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+ if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
xfs_warn(log->l_mp,
@@ -5720,7 +3410,7 @@ xlog_recover(
/*
* Delay log recovery if the debug hook is set. This is debug
- * instrumention to coordinate simulation of I/O failures with
+ * instrumentation to coordinate simulation of I/O failures with
* log recovery.
*/
if (xfs_globals.log_recovery_delay) {
@@ -5735,59 +3425,85 @@ xlog_recover(
: "internal");
error = xlog_do_recover(log, head_blk, tail_blk);
- log->l_flags |= XLOG_RECOVERY_NEEDED;
+ set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
}
return error;
}
/*
- * In the first part of recovery we replay inodes and buffers and build
- * up the list of extent free items which need to be processed. Here
- * we process the extent free items and clean up the on disk unlinked
- * inode lists. This is separated from the first part of recovery so
- * that the root and real-time bitmap inodes can be read in from disk in
- * between the two stages. This is necessary so that we can free space
- * in the real-time portion of the file system.
+ * In the first part of recovery we replay inodes and buffers and build up the
+ * list of intents which need to be processed. Here we process the intents and
+ * clean up the on disk unlinked inode lists. This is separated from the first
+ * part of recovery so that the root and real-time bitmap inodes can be read in
+ * from disk in between the two stages. This is necessary so that we can free
+ * space in the real-time portion of the file system.
*/
int
xlog_recover_finish(
struct xlog *log)
{
+ int error;
+
+ error = xlog_recover_process_intents(log);
+ if (error) {
+ /*
+ * Cancel all the unprocessed intent items now so that we don't
+ * leave them pinned in the AIL. This can cause the AIL to
+ * livelock on the pinned item if anyone tries to push the AIL
+ * (inode reclaim does this) before we get around to
+ * xfs_log_mount_cancel.
+ */
+ xlog_recover_cancel_intents(log);
+ xfs_alert(log->l_mp, "Failed to recover intents");
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+ }
+
/*
- * Now we're ready to do the transactions needed for the
- * rest of recovery. Start with completing all the extent
- * free intent records and then process the unlinked inode
- * lists. At this point, we essentially run in normal mode
- * except that we're still performing recovery actions
- * rather than accepting new requests.
+ * Sync the log to get all the intents out of the AIL. This isn't
+ * absolutely necessary, but it helps in case the unlink transactions
+ * would have problems pushing the intents out of the way.
*/
- if (log->l_flags & XLOG_RECOVERY_NEEDED) {
- int error;
- error = xlog_recover_process_intents(log);
- if (error) {
- xfs_alert(log->l_mp, "Failed to recover intents");
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
+
+ /*
+ * Now that we've recovered the log and all the intents, we can clear
+ * the log incompat feature bits in the superblock because there's no
+ * longer anything to protect. We rely on the AIL push to write out the
+ * updated superblock after everything else.
+ */
+ if (xfs_clear_incompat_log_features(log->l_mp)) {
+ error = xfs_sync_sb(log->l_mp, false);
+ if (error < 0) {
+ xfs_alert(log->l_mp,
+ "Failed to clear log incompat features on recovery");
return error;
}
+ }
+ xlog_recover_process_iunlinks(log);
+
+ /*
+ * Recover any CoW staging blocks that are still referenced by the
+ * ondisk refcount metadata. During mount there cannot be any live
+ * staging extents as we have not permitted any user modifications.
+ * Therefore, it is safe to free them all right now, even on a
+ * read-only mount.
+ */
+ error = xfs_reflink_recover_cow(log->l_mp);
+ if (error) {
+ xfs_alert(log->l_mp,
+ "Failed to recover leftover CoW staging extents, err %d.",
+ error);
/*
- * Sync the log to get all the intents out of the AIL.
- * This isn't absolutely necessary, but it helps in
- * case the unlink transactions would have problems
- * pushing the intents out of the way.
+ * If we get an error here, make sure the log is shut down
+ * but return zero so that any log items committed since the
+ * end of intents processing can be pushed through the CIL
+ * and AIL.
*/
- xfs_log_force(log->l_mp, XFS_LOG_SYNC);
-
- xlog_recover_process_iunlinks(log);
-
- xlog_recover_check_summary(log);
-
- xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
- log->l_mp->m_logname ? log->l_mp->m_logname
- : "internal");
- log->l_flags &= ~XLOG_RECOVERY_NEEDED;
- } else {
- xfs_info(log->l_mp, "Ending clean mount");
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
+
return 0;
}
@@ -5795,57 +3511,7 @@ void
xlog_recover_cancel(
struct xlog *log)
{
- if (log->l_flags & XLOG_RECOVERY_NEEDED)
+ if (xlog_recovery_needed(log))
xlog_recover_cancel_intents(log);
}
-#if defined(DEBUG)
-/*
- * Read all of the agf and agi counters and check that they
- * are consistent with the superblock counters.
- */
-STATIC void
-xlog_recover_check_summary(
- struct xlog *log)
-{
- xfs_mount_t *mp;
- xfs_agf_t *agfp;
- xfs_buf_t *agfbp;
- xfs_buf_t *agibp;
- xfs_agnumber_t agno;
- uint64_t freeblks;
- uint64_t itotal;
- uint64_t ifree;
- int error;
-
- mp = log->l_mp;
-
- freeblks = 0LL;
- itotal = 0LL;
- ifree = 0LL;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
- if (error) {
- xfs_alert(mp, "%s agf read failed agno %d error %d",
- __func__, agno, error);
- } else {
- agfp = XFS_BUF_TO_AGF(agfbp);
- freeblks += be32_to_cpu(agfp->agf_freeblks) +
- be32_to_cpu(agfp->agf_flcount);
- xfs_buf_relse(agfbp);
- }
-
- error = xfs_read_agi(mp, NULL, agno, &agibp);
- if (error) {
- xfs_alert(mp, "%s agi read failed agno %d error %d",
- __func__, agno, error);
- } else {
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
-
- itotal += be32_to_cpu(agi->agi_count);
- ifree += be32_to_cpu(agi->agi_freecount);
- xfs_buf_relse(agibp);
- }
- }
-}
-#endif /* DEBUG */
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index e0f9d3b6abe9..8f495cc23903 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -27,42 +27,34 @@ __xfs_printk(
printk("%sXFS: %pV\n", level, vaf);
}
-#define define_xfs_printk_level(func, kern_level) \
-void func(const struct xfs_mount *mp, const char *fmt, ...) \
-{ \
- struct va_format vaf; \
- va_list args; \
- int level; \
- \
- va_start(args, fmt); \
- \
- vaf.fmt = fmt; \
- vaf.va = &args; \
- \
- __xfs_printk(kern_level, mp, &vaf); \
- va_end(args); \
- \
- if (!kstrtoint(kern_level, 0, &level) && \
- level <= LOGLEVEL_ERR && \
- xfs_error_level >= XFS_ERRLEVEL_HIGH) \
- xfs_stack_trace(); \
-} \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
+void
+xfs_printk_level(
+ const char *kern_level,
+ const struct xfs_mount *mp,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(kern_level, mp, &vaf);
+
+ va_end(args);
+
+ if (!kstrtoint(kern_level, 0, &level) &&
+ level <= LOGLEVEL_ERR &&
+ xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
void
-xfs_alert_tag(
+_xfs_alert_tag(
const struct xfs_mount *mp,
- int panic_tag,
+ uint32_t panic_tag,
const char *fmt, ...)
{
struct va_format vaf;
@@ -117,3 +109,25 @@ xfs_hex_dump(const void *p, int length)
{
print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}
+
+void
+xfs_buf_alert_ratelimited(
+ struct xfs_buf *bp,
+ const char *rlmsg,
+ const char *fmt,
+ ...)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct va_format vaf;
+ va_list args;
+
+ /* use the more aggressive per-target rate limit for buffers */
+ if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ __xfs_printk(KERN_ALERT, mp, &vaf);
+ va_end(args);
+}
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 0b05e10995a0..cc323775a12c 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -2,44 +2,62 @@
#ifndef __XFS_MESSAGE_H
#define __XFS_MESSAGE_H 1
+#include <linux/once_lite.h>
+
struct xfs_mount;
-extern __printf(2, 3)
-void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
extern __printf(3, 4)
-void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+ const char *fmt, ...);
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+ xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \
+})
+#define xfs_emerg(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
#ifdef DEBUG
-extern __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#define xfs_debug(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
#else
-static inline __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
+#define xfs_debug(mp, fmt, ...) do {} while (0)
#endif
-#define xfs_printk_ratelimited(func, dev, fmt, ...) \
+#define xfs_alert_tag(mp, tag, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+ _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+ const char *fmt, ...);
+
+#define xfs_printk_ratelimited(func, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
if (__ratelimit(&_rs)) \
- func(dev, fmt, ##__VA_ARGS__); \
+ func(dev, fmt, ##__VA_ARGS__); \
} while (0)
+#define xfs_printk_once(func, dev, fmt, ...) \
+ DO_ONCE_LITE(func, dev, fmt, ##__VA_ARGS__)
+
#define xfs_emerg_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
#define xfs_alert_ratelimited(dev, fmt, ...) \
@@ -57,9 +75,25 @@ do { \
#define xfs_debug_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_mount(mp, warntag, fmt, ...) \
+do { \
+ if (xfs_should_warn((mp), (warntag))) \
+ xfs_warn((mp), (fmt), ##__VA_ARGS__); \
+} while (0)
+
+#define xfs_warn_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_info, dev, fmt, ##__VA_ARGS__)
+
void assfail(struct xfs_mount *mp, char *expr, char *f, int l);
void asswarn(struct xfs_mount *mp, char *expr, char *f, int l);
extern void xfs_hex_dump(const void *p, int length);
+void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
+ const char *fmt, ...);
+
#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 56efe140c923..e8bb3c2e847e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -21,6 +21,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
@@ -32,6 +33,7 @@
#include "xfs_extent_busy.h"
#include "xfs_health.h"
#include "xfs_trace.h"
+#include "xfs_ag.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -61,7 +63,7 @@ xfs_uuid_mount(
/* Publish UUID in struct super_block */
uuid_copy(&mp->m_super->s_uuid, uuid);
- if (mp->m_flags & XFS_MOUNT_NOUUID)
+ if (xfs_has_nouuid(mp))
return 0;
if (uuid_is_null(uuid)) {
@@ -80,9 +82,9 @@ xfs_uuid_mount(
}
if (hole < 0) {
- xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+ xfs_uuid_table = krealloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- 0);
+ GFP_KERNEL | __GFP_NOFAIL);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
@@ -103,7 +105,7 @@ xfs_uuid_unmount(
uuid_t *uuid = &mp->m_sb.sb_uuid;
int i;
- if (mp->m_flags & XFS_MOUNT_NOUUID)
+ if (xfs_has_nouuid(mp))
return;
mutex_lock(&xfs_uuid_table_mutex);
@@ -119,40 +121,6 @@ xfs_uuid_unmount(
mutex_unlock(&xfs_uuid_table_mutex);
}
-
-STATIC void
-__xfs_free_perag(
- struct rcu_head *head)
-{
- struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
-
- ASSERT(atomic_read(&pag->pag_ref) == 0);
- kmem_free(pag);
-}
-
-/*
- * Free up the per-ag resources associated with the mount structure.
- */
-STATIC void
-xfs_free_perag(
- xfs_mount_t *mp)
-{
- xfs_agnumber_t agno;
- struct xfs_perag *pag;
-
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- spin_lock(&mp->m_perag_lock);
- pag = radix_tree_delete(&mp->m_perag_tree, agno);
- spin_unlock(&mp->m_perag_lock);
- ASSERT(pag);
- ASSERT(atomic_read(&pag->pag_ref) == 0);
- xfs_iunlink_destroy(pag);
- xfs_buf_hash_destroy(pag);
- mutex_destroy(&pag->pag_ici_reclaim_lock);
- call_rcu(&pag->rcu_head, __xfs_free_perag);
- }
-}
-
/*
* Check size of device based on the (data/realtime) block count.
* Note: this check is used by the growfs code as well as mount.
@@ -171,93 +139,6 @@ xfs_sb_validate_fsb_count(
return 0;
}
-int
-xfs_initialize_perag(
- xfs_mount_t *mp,
- xfs_agnumber_t agcount,
- xfs_agnumber_t *maxagi)
-{
- xfs_agnumber_t index;
- xfs_agnumber_t first_initialised = NULLAGNUMBER;
- xfs_perag_t *pag;
- int error = -ENOMEM;
-
- /*
- * Walk the current per-ag tree so we don't try to initialise AGs
- * that already exist (growfs case). Allocate and insert all the
- * AGs we don't find ready for initialisation.
- */
- for (index = 0; index < agcount; index++) {
- pag = xfs_perag_get(mp, index);
- if (pag) {
- xfs_perag_put(pag);
- continue;
- }
-
- pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
- if (!pag)
- goto out_unwind_new_pags;
- pag->pag_agno = index;
- pag->pag_mount = mp;
- spin_lock_init(&pag->pag_ici_lock);
- mutex_init(&pag->pag_ici_reclaim_lock);
- INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- if (xfs_buf_hash_init(pag))
- goto out_free_pag;
- init_waitqueue_head(&pag->pagb_wait);
- spin_lock_init(&pag->pagb_lock);
- pag->pagb_count = 0;
- pag->pagb_tree = RB_ROOT;
-
- if (radix_tree_preload(GFP_NOFS))
- goto out_hash_destroy;
-
- spin_lock(&mp->m_perag_lock);
- if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
- WARN_ON_ONCE(1);
- spin_unlock(&mp->m_perag_lock);
- radix_tree_preload_end();
- error = -EEXIST;
- goto out_hash_destroy;
- }
- spin_unlock(&mp->m_perag_lock);
- radix_tree_preload_end();
- /* first new pag is fully initialized */
- if (first_initialised == NULLAGNUMBER)
- first_initialised = index;
- error = xfs_iunlink_init(pag);
- if (error)
- goto out_hash_destroy;
- spin_lock_init(&pag->pag_state_lock);
- }
-
- index = xfs_set_inode_alloc(mp, agcount);
-
- if (maxagi)
- *maxagi = index;
-
- mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
- return 0;
-
-out_hash_destroy:
- xfs_buf_hash_destroy(pag);
-out_free_pag:
- mutex_destroy(&pag->pag_ici_reclaim_lock);
- kmem_free(pag);
-out_unwind_new_pags:
- /* unwind any prior newly initialized pags */
- for (index = first_initialised; index < agcount; index++) {
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- if (!pag)
- break;
- xfs_buf_hash_destroy(pag);
- xfs_iunlink_destroy(pag);
- mutex_destroy(&pag->pag_ici_reclaim_lock);
- kmem_free(pag);
- }
- return error;
-}
-
/*
* xfs_readsb
*
@@ -310,7 +191,7 @@ reread:
/*
* Initialize the mount structure from the superblock.
*/
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, bp->b_addr);
/*
* If we haven't validated the superblock, do so now before we try
@@ -345,6 +226,7 @@ reread:
goto reread;
}
+ mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
/* no need to be quiet anymore, so reset the buf ops */
@@ -418,27 +300,29 @@ xfs_validate_new_dalign(
"alignment check failed: sunit/swidth vs. blocksize(%d)",
mp->m_sb.sb_blocksize);
return -EINVAL;
- } else {
- /*
- * Convert the stripe unit and width to FSBs.
- */
- mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
- if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
- xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. agsize(%d)",
- mp->m_sb.sb_agblocks);
- return -EINVAL;
- } else if (mp->m_dalign) {
- mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
- } else {
- xfs_warn(mp,
- "alignment check failed: sunit(%d) less than bsize(%d)",
- mp->m_dalign, mp->m_sb.sb_blocksize);
- return -EINVAL;
- }
}
- if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+ /*
+ * Convert the stripe unit and width to FSBs.
+ */
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
+ return -EINVAL;
+ }
+
+ if (!mp->m_dalign) {
+ xfs_warn(mp,
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ }
+
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+
+ if (!xfs_has_dalign(mp)) {
xfs_warn(mp,
"cannot change alignment: superblock does not support data alignment");
return -EINVAL;
@@ -469,8 +353,7 @@ xfs_update_alignment(
sbp->sb_unit = mp->m_dalign;
sbp->sb_width = mp->m_swidth;
mp->m_update_sb = true;
- } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
- xfs_sb_version_hasdalign(&mp->m_sb)) {
+ } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
mp->m_dalign = sbp->sb_unit;
mp->m_swidth = sbp->sb_width;
}
@@ -485,13 +368,16 @@ void
xfs_set_low_space_thresholds(
struct xfs_mount *mp)
{
- int i;
+ uint64_t dblocks = mp->m_sb.sb_dblocks;
+ uint64_t rtexts = mp->m_sb.sb_rextents;
+ int i;
- for (i = 0; i < XFS_LOWSP_MAX; i++) {
- uint64_t space = mp->m_sb.sb_dblocks;
+ do_div(dblocks, 100);
+ do_div(rtexts, 100);
- do_div(space, 100);
- mp->m_low_space[i] = space * (i + 1);
+ for (i = 0; i < XFS_LOWSP_MAX; i++) {
+ mp->m_low_space[i] = dblocks * (i + 1);
+ mp->m_low_rtexts[i] = rtexts * (i + 1);
}
}
@@ -584,6 +470,8 @@ STATIC int
xfs_check_summary_counts(
struct xfs_mount *mp)
{
+ int error = 0;
+
/*
* The AG0 superblock verifier rejects in-progress filesystems,
* so we should never see the flag set this far into mounting.
@@ -605,7 +493,7 @@ xfs_check_summary_counts(
* counters. If any of them are obviously incorrect, we can recompute
* them from the AGF headers in the next step.
*/
- if (XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
+ if (xfs_is_clean(mp) &&
(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
!xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
@@ -622,12 +510,99 @@ xfs_check_summary_counts(
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
- if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
- XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
- !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
- return 0;
+ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+ xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+ error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Older kernels misused sb_frextents to reflect both incore
+ * reservations made by running transactions and the actual count of
+ * free rt extents in the ondisk metadata. Transactions committed
+ * during runtime can therefore contain a superblock update that
+ * undercounts the number of free rt extents tracked in the rt bitmap.
+ * A clean unmount record will have the correct frextents value since
+ * there can be no other transactions running at that point.
+ *
+ * If we're mounting the rt volume after recovering the log, recompute
+ * frextents from the rtbitmap file to fix the inconsistency.
+ */
+ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ error = xfs_rtalloc_reinit_frextents(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed. In other words, callers are required to have inactivated all
+ * inodes.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in a iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background. This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+ struct xfs_mount *mp)
+{
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_extent_busy_wait_all(mp);
+ flush_workqueue(xfs_discard_wq);
+
+ set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate);
+
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_inodegc_stop(mp);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp);
+ xfs_health_unmount(mp);
+}
- return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+static void
+xfs_mount_setup_inode_geom(
+ struct xfs_mount *mp)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+
+ igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
+ ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));
+
+ xfs_ialloc_setup_geometry(mp);
+}
+
+/* Compute maximum possible height for per-AG btree types for this fs. */
+static inline void
+xfs_agbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ unsigned int levels;
+
+ levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
+ levels = max(levels, mp->m_rmap_maxlevels);
+ mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
/*
@@ -674,29 +649,13 @@ xfs_mountfs(
xfs_warn(mp, "correcting sb_features alignment problem");
sbp->sb_features2 |= sbp->sb_bad_features2;
mp->m_update_sb = true;
-
- /*
- * Re-check for ATTR2 in case it was found in bad_features2
- * slot.
- */
- if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- !(mp->m_flags & XFS_MOUNT_NOATTR2))
- mp->m_flags |= XFS_MOUNT_ATTR2;
}
- if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- (mp->m_flags & XFS_MOUNT_NOATTR2)) {
- xfs_sb_version_removeattr2(&mp->m_sb);
- mp->m_update_sb = true;
-
- /* update sb_versionnum for the clearing of the morebits */
- if (!sbp->sb_features2)
- mp->m_update_sb = true;
- }
/* always use v2 inodes by default now */
if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ mp->m_features |= XFS_FEAT_NLINK;
mp->m_update_sb = true;
}
@@ -713,10 +672,12 @@ xfs_mountfs(
xfs_alloc_compute_maxlevels(mp);
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
- xfs_ialloc_setup_geometry(mp);
+ xfs_mount_setup_inode_geom(mp);
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ xfs_agbtree_compute_maxlevels(mp);
+
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
* is NOT aligned turn off m_dalign since allocator alignment is within
@@ -769,7 +730,7 @@ xfs_mountfs(
* cluster size. Full inode chunk alignment must match the chunk size,
* but that is checked on sb read verification...
*/
- if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+ if (xfs_has_sparseinodes(mp) &&
mp->m_sb.sb_spino_align !=
XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
xfs_warn(mp,
@@ -819,7 +780,8 @@ xfs_mountfs(
/*
* Allocate and initialize the per-ag data.
*/
- error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
+ &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
goto out_free_dir;
@@ -831,6 +793,10 @@ xfs_mountfs(
goto out_free_perag;
}
+ error = xfs_inodegc_register_shrinker(mp);
+ if (error)
+ goto out_fail_wait;
+
/*
* Log's mount-time initialization. The first part of recovery can place
* some items on the AIL, to be handled when recovery is finished or
@@ -841,13 +807,25 @@ xfs_mountfs(
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
if (error) {
xfs_warn(mp, "log mount failed");
- goto out_fail_wait;
+ goto out_inodegc_shrinker;
}
- /* Make sure the summary counts are ok. */
- error = xfs_check_summary_counts(mp);
- if (error)
- goto out_log_dealloc;
+ /* Enable background inode inactivation workers. */
+ xfs_inodegc_start(mp);
+ xfs_blockgc_start(mp);
+
+ /*
+ * Now that we've recovered any pending superblock feature bit
+ * additions, we can finish setting up the attr2 behaviour for the
+ * mount. The noattr2 option overrides the superblock flag, so only
+ * check the superblock feature flag if the mount option is not set.
+ */
+ if (xfs_has_noattr2(mp)) {
+ mp->m_features &= ~XFS_FEAT_ATTR2;
+ } else if (!xfs_has_attr2(mp) &&
+ (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
+ mp->m_features |= XFS_FEAT_ATTR2;
+ }
/*
* Get and sanity-check the root inode.
@@ -887,12 +865,17 @@ xfs_mountfs(
goto out_rele_rip;
}
+ /* Make sure the summary counts are ok. */
+ error = xfs_check_summary_counts(mp);
+ if (error)
+ goto out_rtunmount;
+
/*
* If this is a read-only mount defer the superblock updates until
* the next remount into writeable mode. Otherwise we would never
* perform the update e.g. for the root filesystem.
*/
- if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ if (mp->m_update_sb && !xfs_is_readonly(mp)) {
error = xfs_sync_sb(mp, false);
if (error) {
xfs_warn(mp, "failed to write sb changes");
@@ -903,13 +886,11 @@ xfs_mountfs(
/*
* Initialise the XFS quota management subsystem for this mount
*/
- if (XFS_IS_QUOTA_RUNNING(mp)) {
+ if (XFS_IS_QUOTA_ON(mp)) {
error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
if (error)
goto out_rtunmount;
} else {
- ASSERT(!XFS_IS_QUOTA_ON(mp));
-
/*
* If a file system had quotas running earlier, but decided to
* mount without -o uquota/pquota/gquota options, revoke the
@@ -926,9 +907,17 @@ xfs_mountfs(
/*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
- * read in.
+ * read in. Temporarily create per-AG space reservations for metadata
+ * btree shape changes because space freeing transactions (for inode
+ * inactivation) require the per-AG reservation in lieu of reserving
+ * blocks.
*/
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error == -ENOSPC)
+ xfs_warn(mp,
+ "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
error = xfs_log_mount_finish(mp);
+ xfs_fs_unreserve_ag_blocks(mp);
if (error) {
xfs_warn(mp, "log mount finish failed");
goto out_rtunmount;
@@ -943,10 +932,8 @@ xfs_mountfs(
* We use the same quiesce mechanism as the rw->ro remount, as they are
* semantically identical operations.
*/
- if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
- XFS_MOUNT_RDONLY) {
- xfs_quiesce_attr(mp);
- }
+ if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
+ xfs_log_clean(mp);
/*
* Complete the quota initialisation, post-log-replay component.
@@ -969,22 +956,13 @@ xfs_mountfs(
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ if (!xfs_is_readonly(mp)) {
resblks = xfs_default_resblks(mp);
error = xfs_reserve_blocks(mp, &resblks, NULL);
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
- /* Recover any CoW blocks that never got remapped. */
- error = xfs_reflink_recover_cow(mp);
- if (error) {
- xfs_err(mp,
- "Error %d recovering leftover CoW allocations.", error);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- goto out_quota;
- }
-
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
@@ -995,7 +973,6 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
- out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
@@ -1003,8 +980,17 @@ xfs_mountfs(
xfs_irele(rip);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
+
+ /*
+ * Inactivate all inodes that might still be in memory after a log
+ * intent recovery failure so that reclaim can free them. Metadata
+ * inodes and the root directory shouldn't need inactivation, but the
+ * mount failed for some reason, so pull down all the state and flee.
+ */
+ xfs_inodegc_flush(mp);
+
/*
- * Cancel all delayed reclaim work and reclaim the inodes directly.
+ * Flush all inode reclamation work and flush the log.
* We have to do this /after/ rtunmount and qm_unmount because those
* two will have scheduled delayed reclaim for the rt/quota inodes.
*
@@ -1014,16 +1000,15 @@ xfs_mountfs(
* qm_unmount_quotas and therefore rely on qm_unmount to release the
* quota inodes.
*/
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
- xfs_health_unmount(mp);
+ xfs_unmount_flush_inodes(mp);
out_log_dealloc:
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
+ out_inodegc_shrinker:
+ unregister_shrinker(&mp->m_inodegc_shrinker);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_wait_buftarg(mp->m_logdev_targp);
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_drain(mp->m_logdev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
out_free_dir:
@@ -1053,53 +1038,23 @@ xfs_unmountfs(
uint64_t resblks;
int error;
- xfs_stop_block_reaping(mp);
+ /*
+ * Perform all on-disk metadata updates required to inactivate inodes
+ * that the VFS evicted earlier in the unmount process. Freeing inodes
+ * and discarding CoW fork preallocations can cause shape changes to
+ * the free inode and refcount btrees, respectively, so we must finish
+ * this before we discard the metadata space reservations. Metadata
+ * inodes and the root directory do not require inactivation.
+ */
+ xfs_inodegc_flush(mp);
+
+ xfs_blockgc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
- /*
- * We can potentially deadlock here if we have an inode cluster
- * that has been freed has its buffer still pinned in memory because
- * the transaction is still sitting in a iclog. The stale inodes
- * on that buffer will have their flush locks held until the
- * transaction hits the disk and the callbacks run. the inode
- * flush takes the flush lock unconditionally and with nothing to
- * push out the iclog we will never get that unlocked. hence we
- * need to force the log first.
- */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /*
- * Wait for all busy extents to be freed, including completion of
- * any discard operation.
- */
- xfs_extent_busy_wait_all(mp);
- flush_workqueue(xfs_discard_wq);
-
- /*
- * We now need to tell the world we are unmounting. This will allow
- * us to detect that the filesystem is going away and we should error
- * out anything that we have been retrying in the background. This will
- * prevent neverending retries in AIL pushing from hanging the unmount.
- */
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
- /*
- * Flush all pending changes from the AIL.
- */
- xfs_ail_push_all_sync(mp->m_ail);
-
- /*
- * And reclaim all inodes. At this point there should be no dirty
- * inodes and none should be pinned or locked, but use synchronous
- * reclaim just to be sure. We can stop background inode reclaim
- * here as well if it is still running.
- */
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
- xfs_health_unmount(mp);
+ xfs_unmount_flush_inodes(mp);
xfs_qm_unmount(mp);
@@ -1123,12 +1078,6 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "Unable to update superblock counters. "
- "Freespace may not be correct on next mount.");
-
-
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1136,6 +1085,7 @@ xfs_unmountfs(
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
+ unregister_shrinker(&mp->m_inodegc_shrinker);
xfs_free_perag(mp);
xfs_errortag_del(mp);
@@ -1157,96 +1107,39 @@ xfs_fs_writable(
{
ASSERT(level > SB_UNFROZEN);
if ((mp->m_super->s_writers.frozen >= level) ||
- XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
+ xfs_is_shutdown(mp) || xfs_is_readonly(mp))
return false;
return true;
}
-/*
- * xfs_log_sbcount
- *
- * Sync the superblock counters to disk.
- *
- * Note this code can be called during the process of freezing, so we use the
- * transaction allocator that does not block when the transaction subsystem is
- * in its frozen state.
- */
-int
-xfs_log_sbcount(xfs_mount_t *mp)
-{
- /* allow this to proceed during the freeze sequence... */
- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
- return 0;
-
- /*
- * we don't need to do this if we are updating the superblock
- * counters on every modification.
- */
- if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
- return 0;
-
- return xfs_sync_sb(mp, true);
-}
-
-/*
- * Deltas for the inode count are +/-64, hence we use a large batch size
- * of 128 so we don't need to take the counter lock on every update.
- */
-#define XFS_ICOUNT_BATCH 128
-int
-xfs_mod_icount(
- struct xfs_mount *mp,
- int64_t delta)
-{
- percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
- if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
- ASSERT(0);
- percpu_counter_add(&mp->m_icount, -delta);
- return -EINVAL;
- }
- return 0;
-}
-
+/* Adjust m_fdblocks or m_frextents. */
int
-xfs_mod_ifree(
- struct xfs_mount *mp,
- int64_t delta)
-{
- percpu_counter_add(&mp->m_ifree, delta);
- if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
- ASSERT(0);
- percpu_counter_add(&mp->m_ifree, -delta);
- return -EINVAL;
- }
- return 0;
-}
-
-/*
- * Deltas for the block count can vary from 1 to very large, but lock contention
- * only occurs on frequent small block count updates such as in the delayed
- * allocation path for buffered writes (page a time updates). Hence we set
- * a large batch count (1024) to minimise global counter updates except when
- * we get near to ENOSPC and we have to be very accurate with our updates.
- */
-#define XFS_FDBLOCKS_BATCH 1024
-int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
struct xfs_mount *mp,
+ struct percpu_counter *counter,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
+ uint64_t set_aside = 0;
s32 batch;
+ bool has_resv_pool;
+
+ ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+ has_resv_pool = (counter == &mp->m_fdblocks);
+ if (rsvd)
+ ASSERT(has_resv_pool);
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
- if (likely(mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(&mp->m_fdblocks, delta);
+ if (likely(!has_resv_pool ||
+ mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(counter, delta);
return 0;
}
@@ -1258,7 +1151,7 @@ xfs_mod_fdblocks(
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(&mp->m_fdblocks, delta);
+ percpu_counter_add(counter, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
@@ -1272,14 +1165,27 @@ xfs_mod_fdblocks(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
batch = XFS_FDBLOCKS_BATCH;
- percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
- if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
+ /*
+ * Set aside allocbt blocks because these blocks are tracked as free
+ * space but not available for allocation. Technically this means that a
+ * single reservation cannot consume all remaining free space, but the
+ * ratio of allocbt blocks to usable free blocks should be rather small.
+ * The tradeoff without this is that filesystems that maintain high
+ * perag block reservations can over reserve physical block availability
+ * and fail physical allocation, which leads to much more serious
+ * problems (i.e. transaction abort, pagecache discards, etc.) than
+ * slightly premature -ENOSPC.
+ */
+ if (has_resv_pool)
+ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(counter, delta, batch);
+ if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
@@ -1290,8 +1196,8 @@ xfs_mod_fdblocks(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(&mp->m_fdblocks, -delta);
- if (!rsvd)
+ percpu_counter_add(counter, -delta);
+ if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1300,50 +1206,14 @@ xfs_mod_fdblocks(
spin_unlock(&mp->m_sb_lock);
return 0;
}
- printk_once(KERN_WARNING
- "Filesystem \"%s\": reserve blocks depleted! "
- "Consider increasing reserve pool size.",
- mp->m_super->s_id);
+ xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+
fdblocks_enospc:
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
}
-int
-xfs_mod_frextents(
- struct xfs_mount *mp,
- int64_t delta)
-{
- int64_t lcounter;
- int ret = 0;
-
- spin_lock(&mp->m_sb_lock);
- lcounter = mp->m_sb.sb_frextents + delta;
- if (lcounter < 0)
- ret = -ENOSPC;
- else
- mp->m_sb.sb_frextents = lcounter;
- spin_unlock(&mp->m_sb_lock);
- return ret;
-}
-
-/*
- * xfs_getsb() is called to obtain the buffer for the superblock.
- * The buffer is returned locked and read in from disk.
- * The buffer should be released with a call to xfs_brelse().
- */
-struct xfs_buf *
-xfs_getsb(
- struct xfs_mount *mp)
-{
- struct xfs_buf *bp = mp->m_sb_bp;
-
- xfs_buf_lock(bp);
- xfs_buf_hold(bp);
- ASSERT(bp->b_flags & XBF_DONE);
- return bp;
-}
-
/*
* Used to free the superblock along various error paths.
*/
@@ -1382,13 +1252,122 @@ void
xfs_force_summary_recalc(
struct xfs_mount *mp)
{
- if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (!xfs_has_lazysbcount(mp))
return;
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
}
/*
+ * Enable a log incompat feature flag in the primary superblock. The caller
+ * cannot have any other transactions in progress.
+ */
+int
+xfs_add_incompat_log_feature(
+ struct xfs_mount *mp,
+ uint32_t feature)
+{
+ struct xfs_dsb *dsb;
+ int error;
+
+ ASSERT(hweight32(feature) == 1);
+ ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+
+ /*
+ * Force the log to disk and kick the background AIL thread to reduce
+ * the chances that the bwrite will stall waiting for the AIL to unpin
+ * the primary superblock buffer. This isn't a data integrity
+ * operation, so we don't need a synchronous push.
+ */
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+ xfs_ail_push_all(mp->m_ail);
+
+ /*
+ * Lock the primary superblock buffer to serialize all callers that
+ * are trying to set feature bits.
+ */
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_hold(mp->m_sb_bp);
+
+ if (xfs_is_shutdown(mp)) {
+ error = -EIO;
+ goto rele;
+ }
+
+ if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
+ goto rele;
+
+ /*
+ * Write the primary superblock to disk immediately, because we need
+ * the log_incompat bit to be set in the primary super now to protect
+ * the log items that we're going to commit later.
+ */
+ dsb = mp->m_sb_bp->b_addr;
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_features_log_incompat |= cpu_to_be32(feature);
+ error = xfs_bwrite(mp->m_sb_bp);
+ if (error)
+ goto shutdown;
+
+ /*
+ * Add the feature bits to the incore superblock before we unlock the
+ * buffer.
+ */
+ xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
+ xfs_buf_relse(mp->m_sb_bp);
+
+ /* Log the superblock to disk. */
+ return xfs_sync_sb(mp, false);
+shutdown:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+rele:
+ xfs_buf_relse(mp->m_sb_bp);
+ return error;
+}
+
+/*
+ * Clear all the log incompat flags from the superblock.
+ *
+ * The caller cannot be in a transaction, must ensure that the log does not
+ * contain any log items protected by any log incompat bit, and must ensure
+ * that there are no other threads that depend on the state of the log incompat
+ * feature flags in the primary super.
+ *
+ * Returns true if the superblock is dirty.
+ */
+bool
+xfs_clear_incompat_log_features(
+ struct xfs_mount *mp)
+{
+ bool ret = false;
+
+ if (!xfs_has_crc(mp) ||
+ !xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
+ xfs_is_shutdown(mp))
+ return false;
+
+ /*
+ * Update the incore superblock. We synchronize on the primary super
+ * buffer lock to be consistent with the add function, though at least
+ * in theory this shouldn't be necessary.
+ */
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_hold(mp->m_sb_bp);
+
+ if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
+ xfs_sb_remove_incompat_log_features(&mp->m_sb);
+ ret = true;
+ }
+
+ xfs_buf_relse(mp->m_sb_bp);
+ return ret;
+}
+
+/*
* Update the in-core delayed block counter.
*
* We prefer to update the counter without having to take a spinlock for every
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 88ab09ed29e7..8aca2cc173ac 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -12,6 +12,7 @@ struct xfs_mru_cache;
struct xfs_ail;
struct xfs_quotainfo;
struct xfs_da_geometry;
+struct xfs_perag;
/* dynamic preallocation free space thresholds, 5% down to 1% */
enum {
@@ -55,61 +56,37 @@ struct xfs_error_cfg {
long retry_timeout; /* in jiffies, -1 = infinite */
};
-typedef struct xfs_mount {
- struct super_block *m_super;
-
- /*
- * Bitsets of per-fs metadata that have been checked and/or are sick.
- * Callers must hold m_sb_lock to access these two fields.
- */
- uint8_t m_fs_checked;
- uint8_t m_fs_sick;
- /*
- * Bitsets of rt metadata that have been checked and/or are sick.
- * Callers must hold m_sb_lock to access this field.
- */
- uint8_t m_rt_checked;
- uint8_t m_rt_sick;
+/*
+ * Per-cpu deferred inode inactivation GC lists.
+ */
+struct xfs_inodegc {
+ struct llist_head list;
+ struct delayed_work work;
- struct xfs_ail *m_ail; /* fs active log item list */
+ /* approximate count of inodes in the list */
+ unsigned int items;
+ unsigned int shrinker_hits;
+};
+/*
+ * The struct xfsmount layout is optimised to separate read-mostly variables
+ * from variables that are frequently modified. We put the read-mostly variables
+ * first, then place all the other variables at the end.
+ *
+ * Typically, read-mostly variables are those that are set at mount time and
+ * never changed again, or only change rarely as a result of things like sysfs
+ * knobs being tweaked.
+ */
+typedef struct xfs_mount {
struct xfs_sb m_sb; /* copy of fs superblock */
- spinlock_t m_sb_lock; /* sb counter lock */
- struct percpu_counter m_icount; /* allocated inodes counter */
- struct percpu_counter m_ifree; /* free inodes counter */
- struct percpu_counter m_fdblocks; /* free block counter */
- /*
- * Count of data device blocks reserved for delayed allocations,
- * including indlen blocks. Does not include allocated CoW staging
- * extents or anything related to the rt device.
- */
- struct percpu_counter m_delalloc_blks;
-
+ struct super_block *m_super;
+ struct xfs_ail *m_ail; /* fs active log item list */
struct xfs_buf *m_sb_bp; /* buffer for superblock */
char *m_rtname; /* realtime device name */
char *m_logname; /* external log device name */
- int m_bsize; /* fs logical block size */
- xfs_agnumber_t m_agfrotor; /* last ag where space found */
- xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
- spinlock_t m_agirotor_lock;/* .. and lock protecting it */
- xfs_agnumber_t m_maxagi; /* highest inode alloc group */
- uint m_allocsize_log;/* min write size log bytes */
- uint m_allocsize_blocks; /* min write size blocks */
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
- struct xfs_ino_geometry m_ino_geo; /* inode geometry */
- int m_logbufs; /* number of log buffers */
- int m_logbsize; /* size of each log buffer */
- uint m_rsumlevels; /* rt summary levels */
- uint m_rsumsize; /* size of rt summary, bytes */
- /*
- * Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] <= the minimum i for which
- * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
- * inode lock.
- */
- uint8_t *m_rsum_cache;
struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
@@ -117,9 +94,29 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ struct list_head m_mount_list; /* global mount list */
+ void __percpu *m_inodegc; /* percpu inodegc structures */
+
+ /*
+ * Optional cache of rt summary level per bitmap block with the
+ * invariant that m_rsum_cache[bbno] <= the minimum i for which
+ * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+ * inode lock.
+ */
+ uint8_t *m_rsum_cache;
+ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
+ struct workqueue_struct *m_buf_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_reclaim_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
+ struct workqueue_struct *m_blockgc_wq;
+ struct workqueue_struct *m_inodegc_wq;
+
+ int m_bsize; /* fs logical block size */
uint8_t m_blkbit_log; /* blocklog + NBBY */
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
+ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -131,49 +128,98 @@ typedef struct xfs_mount {
uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
- uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
- uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+ uint m_alloc_maxlevels; /* max alloc btree levels */
+ uint m_bm_maxlevels[2]; /* max bmap btree levels */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
+ unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
- struct radix_tree_root m_perag_tree; /* per-ag accounting info */
- spinlock_t m_perag_lock; /* lock for m_perag_tree */
- struct mutex m_growlock; /* growfs mutex */
+ int m_dalign; /* stripe unit */
+ int m_swidth; /* stripe width */
+ xfs_agnumber_t m_maxagi; /* highest inode alloc group */
+ uint m_allocsize_log;/* min write size log bytes */
+ uint m_allocsize_blocks; /* min write size blocks */
+ int m_logbufs; /* number of log buffers */
+ int m_logbsize; /* size of each log buffer */
+ uint m_rsumlevels; /* rt summary levels */
+ uint m_rsumsize; /* size of rt summary, bytes */
int m_fixedfsid[2]; /* unchanged for life of FS */
- uint64_t m_flags; /* global mount flags */
- bool m_finobt_nores; /* no per-AG finobt resv. */
uint m_qflags; /* quota status flags */
+ uint64_t m_features; /* active filesystem features */
+ uint64_t m_low_space[XFS_LOWSP_MAX];
+ uint64_t m_low_rtexts[XFS_LOWSP_MAX];
+ struct xfs_ino_geometry m_ino_geo; /* inode geometry */
struct xfs_trans_resv m_resv; /* precomputed res values */
+ /* low free space thresholds */
+ unsigned long m_opstate; /* dynamic state flags */
+ bool m_always_cow;
+ bool m_fail_unmount;
+ bool m_finobt_nores; /* no per-AG finobt resv. */
+ bool m_update_sb; /* sb needs update in mount */
+
+ /*
+ * Bitsets of per-fs metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_fs_checked;
+ uint8_t m_fs_sick;
+ /*
+ * Bitsets of rt metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access this field.
+ */
+ uint8_t m_rt_checked;
+ uint8_t m_rt_sick;
+
+ /*
+ * End of read-mostly variables. Frequently written variables and locks
+ * should be placed below this comment from now on. The first variable
+ * here is marked as cacheline aligned so they it is separated from
+ * the read-mostly variables.
+ */
+
+ spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
+ struct percpu_counter m_icount; /* allocated inodes counter */
+ struct percpu_counter m_ifree; /* free inodes counter */
+ struct percpu_counter m_fdblocks; /* free block counter */
+ struct percpu_counter m_frextents; /* free rt extent counter */
+
+ /*
+ * Count of data device blocks reserved for delayed allocations,
+ * including indlen blocks. Does not include allocated CoW staging
+ * extents or anything related to the rt device.
+ */
+ struct percpu_counter m_delalloc_blks;
+ /*
+ * Global count of allocation btree blocks in use across all AGs. Only
+ * used when perag reservation is enabled. Helps prevent block
+ * reservation from attempting to reserve allocation btree blocks.
+ */
+ atomic64_t m_allocbt_blks;
+
+ struct radix_tree_root m_perag_tree; /* per-ag accounting info */
+ spinlock_t m_perag_lock; /* lock for m_perag_tree */
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
- int m_dalign; /* stripe unit */
- int m_swidth; /* stripe width */
- uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
- atomic_t m_active_trans; /* number trans frozen */
- struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct delayed_work m_reclaim_work; /* background inode reclaim */
- struct delayed_work m_eofblocks_work; /* background eof blocks
- trimming */
- struct delayed_work m_cowblocks_work; /* background cow blocks
- trimming */
- bool m_update_sb; /* sb needs update in mount */
- int64_t m_low_space[XFS_LOWSP_MAX];
- /* low free space thresholds */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+ xfs_agnumber_t m_agfrotor; /* last ag where space found */
+ xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
+ spinlock_t m_agirotor_lock;/* .. and lock protecting it */
- struct workqueue_struct *m_buf_workqueue;
- struct workqueue_struct *m_unwritten_workqueue;
- struct workqueue_struct *m_cil_workqueue;
- struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_eofblocks_workqueue;
- struct workqueue_struct *m_sync_workqueue;
+ /* Memory shrinker to throttle and reprioritize inodegc */
+ struct shrinker m_inodegc_shrinker;
+ /*
+ * Workqueue item so that we can coalesce multiple inode flush attempts
+ * into a single flush.
+ */
+ struct work_struct m_flush_inodes_work;
/*
* Generation of the filesysyem layout. This is incremented by each
@@ -185,9 +231,8 @@ typedef struct xfs_mount {
* to various other kinds of pain inflicted on the pNFS server.
*/
uint32_t m_generation;
+ struct mutex m_growlock; /* growfs mutex */
- bool m_always_cow;
- bool m_fail_unmount;
#ifdef DEBUG
/*
* Frequency with which errors are injected. Replaces xfs_etest; the
@@ -202,38 +247,196 @@ typedef struct xfs_mount {
#define M_IGEO(mp) (&(mp)->m_ino_geo)
/*
- * Flags for m_flags.
+ * Flags for m_features.
+ *
+ * These are all the active features in the filesystem, regardless of how
+ * they are configured.
*/
-#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
- must be synchronous except
- for space allocations */
-#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */
-#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
-#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
- operations, typically for
- disk errors in metadata */
-#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
-#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
- allocations */
-#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
-#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
-#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
-#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */
-#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */
-#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */
-#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
-#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/
-#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width
- * allocation */
-#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */
-#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
-#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred
+#define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */
+#define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */
+#define XFS_FEAT_QUOTA (1ULL << 2) /* quota active */
+#define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */
+#define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */
+#define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */
+#define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */
+#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
+#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
+#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
+#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
+#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
+#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
+#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
+#define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */
+#define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */
+#define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */
+#define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */
+#define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */
+#define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */
+#define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */
+#define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */
+#define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */
+#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
+#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
+#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
+
+/* Mount features */
+#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
+#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
+#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
+#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
* I/O size in stat() */
-#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
- allocator */
-#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
+#define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */
+#define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */
+#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
+#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
+#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
+#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
+#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
+#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
+#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
+#define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */
+#define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */
+#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */
+
+#define __XFS_HAS_FEAT(name, NAME) \
+static inline bool xfs_has_ ## name (struct xfs_mount *mp) \
+{ \
+ return mp->m_features & XFS_FEAT_ ## NAME; \
+}
+
+/* Some features can be added dynamically so they need a set wrapper, too. */
+#define __XFS_ADD_FEAT(name, NAME) \
+ __XFS_HAS_FEAT(name, NAME); \
+static inline void xfs_add_ ## name (struct xfs_mount *mp) \
+{ \
+ mp->m_features |= XFS_FEAT_ ## NAME; \
+ xfs_sb_version_add ## name(&mp->m_sb); \
+}
+
+/* Superblock features */
+__XFS_ADD_FEAT(attr, ATTR)
+__XFS_HAS_FEAT(nlink, NLINK)
+__XFS_ADD_FEAT(quota, QUOTA)
+__XFS_HAS_FEAT(align, ALIGN)
+__XFS_HAS_FEAT(dalign, DALIGN)
+__XFS_HAS_FEAT(logv2, LOGV2)
+__XFS_HAS_FEAT(sector, SECTOR)
+__XFS_HAS_FEAT(extflg, EXTFLG)
+__XFS_HAS_FEAT(asciici, ASCIICI)
+__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT)
+__XFS_ADD_FEAT(attr2, ATTR2)
+__XFS_HAS_FEAT(parent, PARENT)
+__XFS_ADD_FEAT(projid32, PROJID32)
+__XFS_HAS_FEAT(crc, CRC)
+__XFS_HAS_FEAT(v3inodes, V3INODES)
+__XFS_HAS_FEAT(pquotino, PQUOTINO)
+__XFS_HAS_FEAT(ftype, FTYPE)
+__XFS_HAS_FEAT(finobt, FINOBT)
+__XFS_HAS_FEAT(rmapbt, RMAPBT)
+__XFS_HAS_FEAT(reflink, REFLINK)
+__XFS_HAS_FEAT(sparseinodes, SPINODES)
+__XFS_HAS_FEAT(metauuid, META_UUID)
+__XFS_HAS_FEAT(realtime, REALTIME)
+__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
+__XFS_HAS_FEAT(bigtime, BIGTIME)
+__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
+
+/*
+ * Mount features
+ *
+ * These do not change dynamically - features that can come and go, such as 32
+ * bit inodes and read-only state, are kept as operational state rather than
+ * features.
+ */
+__XFS_HAS_FEAT(noattr2, NOATTR2)
+__XFS_HAS_FEAT(noalign, NOALIGN)
+__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
+__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
+__XFS_HAS_FEAT(wsync, WSYNC)
+__XFS_HAS_FEAT(dirsync, DIRSYNC)
+__XFS_HAS_FEAT(discard, DISCARD)
+__XFS_HAS_FEAT(grpid, GRPID)
+__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
+__XFS_HAS_FEAT(ikeep, IKEEP)
+__XFS_HAS_FEAT(swalloc, SWALLOC)
+__XFS_HAS_FEAT(filestreams, FILESTREAMS)
+__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)
+__XFS_HAS_FEAT(dax_never, DAX_NEVER)
+__XFS_HAS_FEAT(norecovery, NORECOVERY)
+__XFS_HAS_FEAT(nouuid, NOUUID)
-#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
+/*
+ * Operational mount state flags
+ *
+ * Use these with atomic bit ops only!
+ */
+#define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */
+#define XFS_OPSTATE_CLEAN 1 /* mount was clean */
+#define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */
+#define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */
+#define XFS_OPSTATE_READONLY 4 /* read-only fs */
+
+/*
+ * If set, inactivation worker threads will be scheduled to process queued
+ * inodegc work. If not, queued inodes remain in memory waiting to be
+ * processed.
+ */
+#define XFS_OPSTATE_INODEGC_ENABLED 5
+/*
+ * If set, background speculative prealloc gc worker threads will be scheduled
+ * to process queued blockgc work. If not, inodes retain their preallocations
+ * until explicitly deleted.
+ */
+#define XFS_OPSTATE_BLOCKGC_ENABLED 6
+
+/* Kernel has logged a warning about online fsck being used on this fs. */
+#define XFS_OPSTATE_WARNED_SCRUB 7
+/* Kernel has logged a warning about shrink being used on this fs. */
+#define XFS_OPSTATE_WARNED_SHRINK 8
+/* Kernel has logged a warning about logged xattr updates being used. */
+#define XFS_OPSTATE_WARNED_LARP 9
+
+#define __XFS_IS_OPSTATE(name, NAME) \
+static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+} \
+static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+} \
+static inline bool xfs_set_ ## name (struct xfs_mount *mp) \
+{ \
+ return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \
+}
+
+__XFS_IS_OPSTATE(unmounting, UNMOUNTING)
+__XFS_IS_OPSTATE(clean, CLEAN)
+__XFS_IS_OPSTATE(shutdown, SHUTDOWN)
+__XFS_IS_OPSTATE(inode32, INODE32)
+__XFS_IS_OPSTATE(readonly, READONLY)
+__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
+__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
+
+static inline bool
+xfs_should_warn(struct xfs_mount *mp, long nr)
+{
+ return !test_and_set_bit(nr, &mp->m_opstate);
+}
+
+#define XFS_OPSTATE_STRINGS \
+ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \
+ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \
+ { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \
+ { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \
+ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
+ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
+ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
/*
* Max and min values for mount-option defined I/O
@@ -242,20 +445,22 @@ typedef struct xfs_mount {
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
- ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
-#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
int lnnum);
#define xfs_force_shutdown(m,f) \
xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
-#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
-#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
-#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
+#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */
+
+#define XFS_SHUTDOWN_STRINGS \
+ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
+ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \
+ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \
+ { SHUTDOWN_CORRUPT_INCORE, "corruption" }
/*
* Flags for xfs_mountfs
@@ -277,125 +482,53 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
-/* per-AG block reservation data structures*/
-struct xfs_ag_resv {
- /* number of blocks originally reserved here */
- xfs_extlen_t ar_orig_reserved;
- /* number of blocks reserved here */
- xfs_extlen_t ar_reserved;
- /* number of blocks originally asked for */
- xfs_extlen_t ar_asked;
-};
+int xfs_buf_hash_init(struct xfs_perag *pag);
+void xfs_buf_hash_destroy(struct xfs_perag *pag);
+
+extern void xfs_uuid_table_free(void);
+extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
+extern int xfs_mountfs(xfs_mount_t *mp);
+extern void xfs_unmountfs(xfs_mount_t *);
/*
- * Per-ag incore structure, copies of information in agf and agi, to improve the
- * performance of allocation group selection.
+ * Deltas for the block count can vary from 1 to very large, but lock contention
+ * only occurs on frequent small block count updates such as in the delayed
+ * allocation path for buffered writes (page a time updates). Hence we set
+ * a large batch count (1024) to minimise global counter updates except when
+ * we get near to ENOSPC and we have to be very accurate with our updates.
*/
-typedef struct xfs_perag {
- struct xfs_mount *pag_mount; /* owner filesystem */
- xfs_agnumber_t pag_agno; /* AG this structure belongs to */
- atomic_t pag_ref; /* perag reference count */
- char pagf_init; /* this agf's entry is initialized */
- char pagi_init; /* this agi's entry is initialized */
- char pagf_metadata; /* the agf is preferred to be metadata */
- char pagi_inodeok; /* The agi is ok for inodes */
- uint8_t pagf_levels[XFS_BTNUM_AGF];
- /* # of levels in bno & cnt btree */
- bool pagf_agflreset; /* agfl requires reset before use */
- uint32_t pagf_flcount; /* count of blocks in freelist */
- xfs_extlen_t pagf_freeblks; /* total free blocks */
- xfs_extlen_t pagf_longest; /* longest free space */
- uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
- xfs_agino_t pagi_freecount; /* number of free inodes */
- xfs_agino_t pagi_count; /* number of allocated inodes */
-
- /*
- * Inode allocation search lookup optimisation.
- * If the pagino matches, the search for new inodes
- * doesn't need to search the near ones again straight away
- */
- xfs_agino_t pagl_pagino;
- xfs_agino_t pagl_leftrec;
- xfs_agino_t pagl_rightrec;
+#define XFS_FDBLOCKS_BATCH 1024
- /*
- * Bitsets of per-ag metadata that have been checked and/or are sick.
- * Callers should hold pag_state_lock before accessing this field.
- */
- uint16_t pag_checked;
- uint16_t pag_sick;
- spinlock_t pag_state_lock;
-
- spinlock_t pagb_lock; /* lock for pagb_tree */
- struct rb_root pagb_tree; /* ordered tree of busy extents */
- unsigned int pagb_gen; /* generation count for pagb_tree */
- wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
-
- atomic_t pagf_fstrms; /* # of filestreams active in this AG */
-
- spinlock_t pag_ici_lock; /* incore inode cache lock */
- struct radix_tree_root pag_ici_root; /* incore inode cache root */
- int pag_ici_reclaimable; /* reclaimable inodes */
- struct mutex pag_ici_reclaim_lock; /* serialisation point */
- unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
-
- /* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
- struct rhashtable pag_buf_hash;
-
- /* for rcu-safe freeing */
- struct rcu_head rcu_head;
- int pagb_count; /* pagb slots in use */
-
- /* Blocks reserved for all kinds of metadata. */
- struct xfs_ag_resv pag_meta_resv;
- /* Blocks reserved for the reverse mapping btree. */
- struct xfs_ag_resv pag_rmapbt_resv;
-
- /* reference count */
- uint8_t pagf_refcount_level;
-
- /*
- * Unlinked inode information. This incore information reflects
- * data stored in the AGI, so callers must hold the AGI buffer lock
- * or have some other means to control concurrency.
- */
- struct rhashtable pagi_unlinked_hash;
-} xfs_perag_t;
-
-static inline struct xfs_ag_resv *
-xfs_perag_resv(
- struct xfs_perag *pag,
- enum xfs_ag_resv_type type)
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+static inline uint64_t
+xfs_fdblocks_unavailable(
+ struct xfs_mount *mp)
{
- switch (type) {
- case XFS_AG_RESV_METADATA:
- return &pag->pag_meta_resv;
- case XFS_AG_RESV_RMAPBT:
- return &pag->pag_rmapbt_resv;
- default:
- return NULL;
- }
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-int xfs_buf_hash_init(xfs_perag_t *pag);
-void xfs_buf_hash_destroy(xfs_perag_t *pag);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ int64_t delta, bool rsvd);
-extern void xfs_uuid_table_free(void);
-extern int xfs_log_sbcount(xfs_mount_t *);
-extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
-extern int xfs_mountfs(xfs_mount_t *mp);
-extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
- xfs_agnumber_t *maxagi);
-extern void xfs_unmountfs(xfs_mount_t *);
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+ return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
-extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
-extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
-extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
- bool reserved);
-extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+ return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
@@ -411,6 +544,8 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
int error_class, int error);
void xfs_force_summary_recalc(struct xfs_mount *mp);
+int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
+bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index a06661dac5be..f85e3b07ab44 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -219,7 +219,7 @@ _xfs_mru_cache_list_insert(
* When destroying or reaping, all the elements that were migrated to the reap
* list need to be deleted. For each element this involves removing it from the
* data store, removing it from the reap list, calling the client's free
- * function and deleting the element from the element zone.
+ * function and deleting the element from the element cache.
*
* We get called holding the mru->lock, which we drop and then reacquire.
* Sparse need special help with this to tell it we know what we are doing.
@@ -294,7 +294,7 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
new file mode 100644
index 000000000000..c4078d0ec108
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Fujitsu. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+
+#include <linux/mm.h>
+#include <linux/dax.h>
+
+struct xfs_failure_info {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ int mf_flags;
+ bool want_shutdown;
+};
+
+static pgoff_t
+xfs_failure_pgoff(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct xfs_failure_info *notify)
+{
+ loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
+
+ if (notify->startblock > rec->rm_startblock)
+ pos += XFS_FSB_TO_B(mp,
+ notify->startblock - rec->rm_startblock);
+ return pos >> PAGE_SHIFT;
+}
+
+static unsigned long
+xfs_failure_pgcnt(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct xfs_failure_info *notify)
+{
+ xfs_agblock_t end_rec;
+ xfs_agblock_t end_notify;
+ xfs_agblock_t start_cross;
+ xfs_agblock_t end_cross;
+
+ start_cross = max(rec->rm_startblock, notify->startblock);
+
+ end_rec = rec->rm_startblock + rec->rm_blockcount;
+ end_notify = notify->startblock + notify->blockcount;
+ end_cross = min(end_rec, end_notify);
+
+ return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
+}
+
+static int
+xfs_dax_failure_fn(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *data)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip;
+ struct xfs_failure_info *notify = data;
+ int error = 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+ (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ notify->want_shutdown = true;
+ return 0;
+ }
+
+ /* Get files that incore, filter out others that are not in use. */
+ error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+ 0, &ip);
+ /* Continue the rmap query if the inode isn't incore */
+ if (error == -ENODATA)
+ return 0;
+ if (error) {
+ notify->want_shutdown = true;
+ return 0;
+ }
+
+ error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
+ xfs_failure_pgoff(mp, rec, notify),
+ xfs_failure_pgcnt(mp, rec, notify),
+ notify->mf_flags);
+ xfs_irele(ip);
+ return error;
+}
+
+static int
+xfs_dax_notify_ddev_failure(
+ struct xfs_mount *mp,
+ xfs_daddr_t daddr,
+ xfs_daddr_t bblen,
+ int mf_flags)
+{
+ struct xfs_failure_info notify = { .mf_flags = mf_flags };
+ struct xfs_trans *tp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_buf *agf_bp = NULL;
+ int error = 0;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+ xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
+ xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ for (; agno <= end_agno; agno++) {
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_agf *agf;
+ xfs_agblock_t agend;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+
+ /*
+ * Set the rmap range from ri_low to ri_high, which represents
+ * a [start, end] where we looking for the files or metadata.
+ */
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+ ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
+ if (agno == end_agno)
+ ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+
+ agf = agf_bp->b_addr;
+ agend = min(be32_to_cpu(agf->agf_length),
+ ri_high.rm_startblock);
+ notify.startblock = ri_low.rm_startblock;
+ notify.blockcount = agend - ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_dax_failure_fn, &notify);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agf_bp);
+ xfs_perag_put(pag);
+ if (error)
+ break;
+
+ fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
+ }
+
+ xfs_trans_cancel(tp);
+ if (error || notify.want_shutdown) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ if (!error)
+ error = -EFSCORRUPTED;
+ }
+ return error;
+}
+
+static int
+xfs_dax_notify_failure(
+ struct dax_device *dax_dev,
+ u64 offset,
+ u64 len,
+ int mf_flags)
+{
+ struct xfs_mount *mp = dax_holder(dax_dev);
+ u64 ddev_start;
+ u64 ddev_end;
+
+ if (!(mp->m_super->s_flags & SB_BORN)) {
+ xfs_warn(mp, "filesystem is not ready for notify_failure()!");
+ return -EIO;
+ }
+
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
+ xfs_debug(mp,
+ "notify_failure() not supported on realtime device!");
+ return -EOPNOTSUPP;
+ }
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
+ mp->m_logdev_targp != mp->m_ddev_targp) {
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_has_rmapbt(mp)) {
+ xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
+ return -EOPNOTSUPP;
+ }
+
+ ddev_start = mp->m_ddev_targp->bt_dax_part_off;
+ ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+
+ /* Ignore the range out of filesystem area */
+ if (offset + len < ddev_start)
+ return -ENXIO;
+ if (offset > ddev_end)
+ return -ENXIO;
+
+ /* Calculate the real range when it touches the boundary */
+ if (offset > ddev_start)
+ offset -= ddev_start;
+ else {
+ len -= ddev_start - offset;
+ offset = 0;
+ }
+ if (offset + len > ddev_end)
+ len -= ddev_end - offset;
+
+ return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
+ mf_flags);
+}
+
+const struct dax_holder_operations xfs_dax_holder_operations = {
+ .notify_failure = xfs_dax_notify_failure,
+};
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 5f04d8a5ab2a..9737b5a9f405 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -15,6 +15,10 @@
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
+#define XFS_CHECK_VALUE(value, expected) \
+ BUILD_BUG_ON_MSG((value) != (expected), \
+ "XFS: value of " #value " is wrong, expected " #expected)
+
static inline void __init
xfs_check_ondisk_structs(void)
{
@@ -23,7 +27,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
@@ -41,7 +45,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
@@ -84,12 +89,12 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
@@ -113,19 +118,37 @@ xfs_check_ondisk_structs(void)
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_log_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_cud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rui_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+
+ XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
+ XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
/*
* The v5 superblock format extended several v4 header structures with
@@ -152,6 +175,20 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
+
+ /*
+ * Make sure the incore inode timestamp range corresponds to hand
+ * converted values based on the ondisk format specification.
+ */
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET,
+ XFS_LEGACY_TIME_MIN);
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET,
+ 16299260424LL);
+
+ /* Do the same with the incore quota expiration range. */
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
+ 16299260424LL);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index bb3008d390aa..37a24f0f7cd4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -58,9 +58,8 @@ xfs_fs_get_uuid(
{
struct xfs_mount *mp = XFS_M(sb);
- printk_once(KERN_NOTICE
-"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
- mp->m_super->s_id);
+ xfs_notice_once(mp,
+"Using experimental pNFS feature, use at your own risk!");
if (*len < sizeof(uuid_t))
return -EINVAL;
@@ -72,6 +71,40 @@ xfs_fs_get_uuid(
}
/*
+ * We cannot use file based VFS helpers such as file_modified() to update
+ * inode state as we modify the data/metadata in the inode here. Hence we have
+ * to open code the timestamp updates and SUID/SGID stripping. We also need
+ * to set the inode prealloc flag to ensure that the extents we allocate are not
+ * removed if the inode is reclaimed from memory before xfs_fs_block_commit()
+ * is from the client to indicate that data has been written and the file size
+ * can be extended.
+ */
+static int
+xfs_fs_map_update_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+ 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ VFS_I(ip)->i_mode &= ~S_ISUID;
+ if (VFS_I(ip)->i_mode & S_IXGRP)
+ VFS_I(ip)->i_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return xfs_trans_commit(tp);
+}
+
+/*
* Get a layout for the pNFS client.
*/
int
@@ -93,7 +126,7 @@ xfs_fs_map_blocks(
uint lock_flags;
int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -135,7 +168,7 @@ xfs_fs_map_blocks(
goto out_unlock;
error = invalidate_inode_pages2(inode->i_mapping);
if (WARN_ON_ONCE(error))
- return error;
+ goto out_unlock;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -156,7 +189,7 @@ xfs_fs_map_blocks(
xfs_iunlock(ip, lock_flags);
error = xfs_iomap_write_direct(ip, offset_fsb,
- end_fsb - offset_fsb, &imap);
+ end_fsb - offset_fsb, 0, &imap);
if (error)
goto out_unlock;
@@ -165,16 +198,18 @@ xfs_fs_map_blocks(
* that the blocks allocated and handed out to the client are
* guaranteed to be present even after a server crash.
*/
- error = xfs_update_prealloc_flags(ip,
- XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+ error = xfs_fs_map_update_inode(ip);
+ if (!error)
+ error = xfs_log_force_inode(ip);
if (error)
goto out_unlock;
+
} else {
xfs_iunlock(ip, lock_flags);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
@@ -256,7 +291,7 @@ xfs_fs_commit_blocks(
length = end - start;
if (!length)
continue;
-
+
/*
* Make sure reads through the pagecache see the new data.
*/
@@ -284,10 +319,11 @@ xfs_fs_commit_blocks(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_setattr_time(ip, iattr);
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(&init_user_ns, inode, iattr);
if (update_isize) {
i_size_write(inode, iattr->ia_size);
- ip->i_d.di_size = iattr->ia_size;
+ ip->i_disk_size = iattr->ia_size;
}
xfs_trans_set_sync(tp);
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
index 4bcc3e61056c..c283b801cc5d 100644
--- a/fs/xfs/xfs_pwork.c
+++ b/fs/xfs/xfs_pwork.c
@@ -61,16 +61,18 @@ xfs_pwork_init(
struct xfs_mount *mp,
struct xfs_pwork_ctl *pctl,
xfs_pwork_work_fn work_fn,
- const char *tag,
- unsigned int nr_threads)
+ const char *tag)
{
+ unsigned int nr_threads = 0;
+
#ifdef DEBUG
if (xfs_globals.pwork_threads >= 0)
nr_threads = xfs_globals.pwork_threads;
#endif
trace_xfs_pwork_init(mp, nr_threads, current->pid);
- pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+ pctl->wq = alloc_workqueue("%s-%d",
+ WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag,
current->pid);
if (!pctl->wq)
return -ENOMEM;
@@ -117,20 +119,3 @@ xfs_pwork_poll(
atomic_read(&pctl->nr_work) == 0, HZ) == 0)
touch_softlockup_watchdog();
}
-
-/*
- * Return the amount of parallelism that the data device can handle, or 0 for
- * no limit.
- */
-unsigned int
-xfs_pwork_guess_datadev_parallelism(
- struct xfs_mount *mp)
-{
- struct xfs_buftarg *btp = mp->m_ddev_targp;
-
- /*
- * For now we'll go with the most conservative setting possible,
- * which is two threads for an SSD and 1 thread everywhere else.
- */
- return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
-}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
index 8133124cf3bb..c0ef81fc85dd 100644
--- a/fs/xfs/xfs_pwork.h
+++ b/fs/xfs/xfs_pwork.h
@@ -51,11 +51,9 @@ xfs_pwork_want_abort(
}
int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
- xfs_pwork_work_fn work_fn, const char *tag,
- unsigned int nr_threads);
+ xfs_pwork_work_fn work_fn, const char *tag);
void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
-unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 0b0909657bad..18bb4ec4d7c9 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -23,6 +23,9 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_error.h"
+#include "xfs_ag.h"
+#include "xfs_ialloc.h"
+#include "xfs_log_priv.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -47,7 +50,7 @@ STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
STATIC int
xfs_qm_dquot_walk(
struct xfs_mount *mp,
- int type,
+ xfs_dqtype_t type,
int (*execute)(struct xfs_dquot *dqp, void *data),
void *data)
{
@@ -79,7 +82,7 @@ restart:
for (i = 0; i < nr_found; i++) {
struct xfs_dquot *dqp = batch[i];
- next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
+ next_index = dqp->q_id + 1;
error = execute(batch[i], data);
if (error == -EAGAIN) {
@@ -119,16 +122,14 @@ xfs_qm_dqpurge(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ int error = -EAGAIN;
xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- return -EAGAIN;
- }
+ if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
- dqp->dq_flags |= XFS_DQ_FREEING;
+ dqp->q_flags |= XFS_DQFLAG_FREEING;
xfs_dqflock(dqp);
@@ -139,7 +140,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
- int error;
/*
* We don't care about getting disk errors here. We need
@@ -149,19 +149,21 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ dqp->q_flags &= ~XFS_DQFLAG_FREEING;
+ goto out_unlock;
}
xfs_dqflock(dqp);
}
ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) ||
!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
- be32_to_cpu(dqp->q_core.d_id));
+ radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
qi->qi_dquots--;
/*
@@ -170,26 +172,26 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(mp, xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
}
/*
* Purge the dquot cache.
*/
-void
+static void
xfs_qm_dqpurge_all(
- struct xfs_mount *mp,
- uint flags)
+ struct xfs_mount *mp)
{
- if (flags & XFS_QMOPT_UQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
- if (flags & XFS_QMOPT_GQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
- if (flags & XFS_QMOPT_PQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
}
/*
@@ -200,7 +202,7 @@ xfs_qm_unmount(
struct xfs_mount *mp)
{
if (mp->m_quotainfo) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ xfs_qm_dqpurge_all(mp);
xfs_qm_destroy_quotainfo(mp);
}
}
@@ -245,8 +247,7 @@ xfs_qm_unmount_quotas(
STATIC int
xfs_qm_dqattach_one(
struct xfs_inode *ip,
- xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
bool doalloc,
struct xfs_dquot **IO_idqpp)
{
@@ -294,8 +295,6 @@ xfs_qm_need_dqattach(
{
struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return false;
if (!XFS_IS_QUOTA_ON(mp))
return false;
if (!XFS_NOT_DQATTACHED(mp, ip))
@@ -326,7 +325,7 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
doalloc, &ip->i_udquot);
if (error)
goto done;
@@ -334,7 +333,7 @@ xfs_qm_dqattach_locked(
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP,
doalloc, &ip->i_gdquot);
if (error)
goto done;
@@ -342,7 +341,7 @@ xfs_qm_dqattach_locked(
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -469,7 +468,7 @@ xfs_qm_dquot_isolate(
/*
* Prevent lookups now that we are past the point of no return.
*/
- dqp->dq_flags |= XFS_DQ_FREEING;
+ dqp->q_flags |= XFS_DQFLAG_FREEING;
xfs_dqunlock(dqp);
ASSERT(dqp->q_nrefs == 0);
@@ -541,31 +540,29 @@ xfs_qm_shrink_count(
STATIC void
xfs_qm_set_defquota(
struct xfs_mount *mp,
- uint type,
+ xfs_dqtype_t type,
struct xfs_quotainfo *qinf)
{
struct xfs_dquot *dqp;
struct xfs_def_quota *defq;
- struct xfs_disk_dquot *ddqp;
int error;
error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
if (error)
return;
- ddqp = &dqp->q_core;
- defq = xfs_get_defquota(dqp, qinf);
+ defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp));
/*
* Timers and warnings have been already set, let's just set the
* default limits for this quota type
*/
- defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+ defq->blk.hard = dqp->q_blk.hardlimit;
+ defq->blk.soft = dqp->q_blk.softlimit;
+ defq->ino.hard = dqp->q_ino.hardlimit;
+ defq->ino.soft = dqp->q_ino.softlimit;
+ defq->rtb.hard = dqp->q_rtb.hardlimit;
+ defq->rtb.soft = dqp->q_rtb.softlimit;
xfs_qm_dqdestroy(dqp);
}
@@ -573,19 +570,18 @@ xfs_qm_set_defquota(
static void
xfs_qm_init_timelimits(
struct xfs_mount *mp,
- struct xfs_quotainfo *qinf)
+ xfs_dqtype_t type)
{
- struct xfs_disk_dquot *ddqp;
+ struct xfs_quotainfo *qinf = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
struct xfs_dquot *dqp;
- uint type;
int error;
- qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+ defq = xfs_get_defquota(qinf, type);
+
+ defq->blk.time = XFS_QM_BTIMELIMIT;
+ defq->ino.time = XFS_QM_ITIMELIMIT;
+ defq->rtb.time = XFS_QM_RTBTIMELIMIT;
/*
* We try to get the limits from the superuser's limits fields.
@@ -593,39 +589,22 @@ xfs_qm_init_timelimits(
*
* Since we may not have done a quotacheck by this point, just read
* the dquot without attaching it to any hashtables or lists.
- *
- * Timers and warnings are globally set by the first timer found in
- * user/group/proj quota types, otherwise a default value is used.
- * This should be split into different fields per quota type.
*/
- if (XFS_IS_UQUOTA_RUNNING(mp))
- type = XFS_DQ_USER;
- else if (XFS_IS_GQUOTA_RUNNING(mp))
- type = XFS_DQ_GROUP;
- else
- type = XFS_DQ_PROJ;
error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
if (error)
return;
- ddqp = &dqp->q_core;
/*
* The warnings and timers set the grace period given to
* a user or group before he or she can not perform any
* more writing. If it is zero, a default is used.
*/
- if (ddqp->d_btimer)
- qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer);
- if (ddqp->d_itimer)
- qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer);
- if (ddqp->d_rtbtimer)
- qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
- if (ddqp->d_bwarns)
- qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
- if (ddqp->d_iwarns)
- qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
- if (ddqp->d_rtbwarns)
- qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+ if (dqp->q_blk.timer)
+ defq->blk.time = dqp->q_blk.timer;
+ if (dqp->q_ino.timer)
+ defq->ino.time = dqp->q_ino.timer;
+ if (dqp->q_rtb.timer)
+ defq->rtb.time = dqp->q_rtb.timer;
xfs_qm_dqdestroy(dqp);
}
@@ -641,7 +620,7 @@ xfs_qm_init_quotainfo(
struct xfs_quotainfo *qinf;
int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
@@ -668,24 +647,38 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
+ if (xfs_has_bigtime(mp)) {
+ qinf->qi_expiry_min =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN);
+ qinf->qi_expiry_max =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX);
+ } else {
+ qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN;
+ qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX;
+ }
+ trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min,
+ qinf->qi_expiry_max);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
- xfs_qm_init_timelimits(mp, qinf);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
- if (XFS_IS_UQUOTA_RUNNING(mp))
- xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
- if (XFS_IS_GQUOTA_RUNNING(mp))
- xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
- if (XFS_IS_PQUOTA_RUNNING(mp))
- xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
+ if (XFS_IS_UQUOTA_ON(mp))
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf);
+ if (XFS_IS_GQUOTA_ON(mp))
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf);
+ if (XFS_IS_PQUOTA_ON(mp))
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
- error = register_shrinker(&qinf->qi_shrinker);
+ error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
+ mp->m_super->s_id);
if (error)
goto out_free_inos;
@@ -732,15 +725,15 @@ xfs_qm_destroy_quotainfo(
*/
STATIC int
xfs_qm_qino_alloc(
- xfs_mount_t *mp,
- xfs_inode_t **ip,
- uint flags)
+ struct xfs_mount *mp,
+ struct xfs_inode **ipp,
+ unsigned int flags)
{
- xfs_trans_t *tp;
- int error;
- bool need_alloc = true;
+ struct xfs_trans *tp;
+ int error;
+ bool need_alloc = true;
- *ip = NULL;
+ *ipp = NULL;
/*
* With superblock that doesn't have separate pquotino, we
* share an inode between gquota and pquota. If the on-disk
@@ -748,7 +741,7 @@ xfs_qm_qino_alloc(
* with PQUOTA, just use sb_gquotino for sb_pquotino and
* vice-versa.
*/
- if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+ if (!xfs_has_pquotino(mp) &&
(flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
xfs_ino_t ino = NULLFSINO;
@@ -766,7 +759,7 @@ xfs_qm_qino_alloc(
return -EFSCORRUPTED;
}
if (ino != NULLFSINO) {
- error = xfs_iget(mp, NULL, ino, 0, 0, ip);
+ error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
if (error)
return error;
mp->m_sb.sb_gquotino = NULLFSINO;
@@ -776,12 +769,18 @@ xfs_qm_qino_alloc(
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
- XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+ need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0,
+ 0, 0, &tp);
if (error)
return error;
if (need_alloc) {
- error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip);
+ xfs_ino_t ino;
+
+ error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
+ if (!error)
+ error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino,
+ S_IFREG, 1, 0, 0, false, ipp);
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -795,9 +794,9 @@ xfs_qm_qino_alloc(
*/
spin_lock(&mp->m_sb_lock);
if (flags & XFS_QMOPT_SBVERSION) {
- ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
+ ASSERT(!xfs_has_quota(mp));
- xfs_sb_version_addquota(&mp->m_sb);
+ xfs_add_quota(mp);
mp->m_sb.sb_uquotino = NULLFSINO;
mp->m_sb.sb_gquotino = NULLFSINO;
mp->m_sb.sb_pquotino = NULLFSINO;
@@ -806,35 +805,34 @@ xfs_qm_qino_alloc(
mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
}
if (flags & XFS_QMOPT_UQUOTA)
- mp->m_sb.sb_uquotino = (*ip)->i_ino;
+ mp->m_sb.sb_uquotino = (*ipp)->i_ino;
else if (flags & XFS_QMOPT_GQUOTA)
- mp->m_sb.sb_gquotino = (*ip)->i_ino;
+ mp->m_sb.sb_gquotino = (*ipp)->i_ino;
else
- mp->m_sb.sb_pquotino = (*ip)->i_ino;
+ mp->m_sb.sb_pquotino = (*ipp)->i_ino;
spin_unlock(&mp->m_sb_lock);
xfs_log_sb(tp);
error = xfs_trans_commit(tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
}
if (need_alloc)
- xfs_finish_inode_setup(*ip);
+ xfs_finish_inode_setup(*ipp);
return error;
}
STATIC void
xfs_qm_reset_dqcounts(
- xfs_mount_t *mp,
- xfs_buf_t *bp,
- xfs_dqid_t id,
- uint type)
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
{
struct xfs_dqblk *dqb;
int j;
- xfs_failaddr_t fa;
trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -844,7 +842,7 @@ xfs_qm_reset_dqcounts(
*/
#ifdef DEBUG
j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
- sizeof(xfs_dqblk_t);
+ sizeof(struct xfs_dqblk);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
@@ -859,26 +857,36 @@ xfs_qm_reset_dqcounts(
* find uninitialised dquot blks. See comment in
* xfs_dquot_verify.
*/
- fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
- if (fa)
+ if (xfs_dqblk_verify(mp, &dqb[j], id + j) ||
+ (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type)
xfs_dqblk_repair(mp, &dqb[j], id + j, type);
/*
* Reset type in case we are reusing group quota file for
* project quotas or vice versa
*/
- ddq->d_flags = type;
+ ddq->d_type = type;
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
- ddq->d_btimer = 0;
- ddq->d_itimer = 0;
- ddq->d_rtbtimer = 0;
- ddq->d_bwarns = 0;
- ddq->d_iwarns = 0;
- ddq->d_rtbwarns = 0;
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+
+ /*
+ * dquot id 0 stores the default grace period and the maximum
+ * warning limit that were set by the administrator, so we
+ * should not reset them.
+ */
+ if (ddq->d_id != 0) {
+ ddq->d_btimer = 0;
+ ddq->d_itimer = 0;
+ ddq->d_rtbtimer = 0;
+ ddq->d_bwarns = 0;
+ ddq->d_iwarns = 0;
+ ddq->d_rtbwarns = 0;
+ if (xfs_has_bigtime(mp))
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
+ }
+
+ if (xfs_has_crc(mp)) {
xfs_update_cksum((char *)&dqb[j],
sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -892,17 +900,13 @@ xfs_qm_reset_dqcounts_all(
xfs_dqid_t firstid,
xfs_fsblock_t bno,
xfs_filblks_t blkcnt,
- uint flags,
+ xfs_dqtype_t type,
struct list_head *buffer_list)
{
struct xfs_buf *bp;
- int error;
- int type;
+ int error = 0;
ASSERT(blkcnt > 0);
- type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
- (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
- error = 0;
/*
* Blkcnt arg can be a very big number, and might even be
@@ -962,7 +966,7 @@ STATIC int
xfs_qm_reset_dqcounts_buf(
struct xfs_mount *mp,
struct xfs_inode *qip,
- uint flags,
+ xfs_dqtype_t type,
struct list_head *buffer_list)
{
struct xfs_bmbt_irec *map;
@@ -980,7 +984,7 @@ xfs_qm_reset_dqcounts_buf(
* trans_reserve. But, this gets called during quotacheck, and that
* happens only at mount time which is single threaded.
*/
- if (qip->i_d.di_nblocks == 0)
+ if (qip->i_nblocks == 0)
return 0;
map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
@@ -1038,7 +1042,7 @@ xfs_qm_reset_dqcounts_buf(
error = xfs_qm_reset_dqcounts_all(mp, firstid,
map[i].br_startblock,
map[i].br_blockcount,
- flags, buffer_list);
+ type, buffer_list);
if (error)
goto out;
}
@@ -1060,7 +1064,7 @@ out:
STATIC int
xfs_qm_quotacheck_dqadjust(
struct xfs_inode *ip,
- uint type,
+ xfs_dqtype_t type,
xfs_qcnt_t nblks,
xfs_qcnt_t rtblks)
{
@@ -1086,15 +1090,15 @@ xfs_qm_quotacheck_dqadjust(
* Adjust the inode count and the block count to reflect this inode's
* resource usage.
*/
- be64_add_cpu(&dqp->q_core.d_icount, 1);
- dqp->q_res_icount++;
+ dqp->q_ino.count++;
+ dqp->q_ino.reserved++;
if (nblks) {
- be64_add_cpu(&dqp->q_core.d_bcount, nblks);
- dqp->q_res_bcount += nblks;
+ dqp->q_blk.count += nblks;
+ dqp->q_blk.reserved += nblks;
}
if (rtblks) {
- be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
- dqp->q_res_rtbcount += rtblks;
+ dqp->q_rtb.count += rtblks;
+ dqp->q_rtb.reserved += rtblks;
}
/*
@@ -1102,12 +1106,12 @@ xfs_qm_quotacheck_dqadjust(
*
* There are no timers for the default values set in the root dquot.
*/
- if (dqp->q_core.d_id) {
- xfs_qm_adjust_dqlimits(mp, dqp);
- xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
+ if (dqp->q_id) {
+ xfs_qm_adjust_dqlimits(dqp);
+ xfs_qm_adjust_dqtimers(dqp);
}
- dqp->dq_flags |= XFS_DQ_DIRTY;
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
xfs_qm_dqput(dqp);
return 0;
}
@@ -1129,7 +1133,7 @@ xfs_qm_dqusage_adjust(
xfs_filblks_t rtblks = 0; /* total rt blks */
int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
/*
* rootino must have its resources accounted for, not so with the quota
@@ -1151,18 +1155,16 @@ xfs_qm_dqusage_adjust(
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
- if (error)
- goto error0;
- }
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto error0;
xfs_bmap_count_leaves(ifp, &rtblks);
}
- nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+ nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
/*
* Add the (disk blocks and inode) resources occupied by this
@@ -1177,21 +1179,21 @@ xfs_qm_dqusage_adjust(
* and quotaoffs don't race. (Quotachecks happen at mount time only).
*/
if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks,
rtblks);
if (error)
goto error0;
}
if (XFS_IS_GQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks,
rtblks);
if (error)
goto error0;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks,
rtblks);
if (error)
goto error0;
@@ -1213,7 +1215,7 @@ xfs_qm_flush_one(
int error = 0;
xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING)
+ if (dqp->q_flags & XFS_DQFLAG_FREEING)
goto out_unlock;
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
@@ -1228,10 +1230,14 @@ xfs_qm_flush_one(
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
- bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0);
- if (!bp) {
- error = -EINVAL;
+ error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ goto out_unlock;
+
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ error = -EAGAIN;
+ xfs_buf_relse(bp);
goto out_unlock;
}
xfs_buf_unlock(bp);
@@ -1272,7 +1278,7 @@ xfs_qm_quotacheck(
flags = 0;
ASSERT(uip || gip || pip);
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1282,7 +1288,7 @@ xfs_qm_quotacheck(
* We don't log our changes till later.
*/
if (uip) {
- error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER,
&buffer_list);
if (error)
goto error_return;
@@ -1290,7 +1296,7 @@ xfs_qm_quotacheck(
}
if (gip) {
- error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP,
&buffer_list);
if (error)
goto error_return;
@@ -1298,7 +1304,7 @@ xfs_qm_quotacheck(
}
if (pip) {
- error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ,
&buffer_list);
if (error)
goto error_return;
@@ -1307,25 +1313,32 @@ xfs_qm_quotacheck(
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
- if (error)
+ if (error) {
+ /*
+ * The inode walk may have partially populated the dquot
+ * caches. We must purge them before disabling quota and
+ * tearing down the quotainfo, or else the dquots will leak.
+ */
+ xfs_qm_dqpurge_all(mp);
goto error_return;
+ }
/*
* We've made all the changes that we need to make incore. Flush them
* down to disk buffers if everything was updated successfully.
*/
if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+ error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one,
&buffer_list);
}
if (XFS_IS_GQUOTA_ON(mp)) {
- error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one,
&buffer_list);
if (!error)
error = error2;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one,
&buffer_list);
if (!error)
error = error2;
@@ -1343,7 +1356,7 @@ xfs_qm_quotacheck(
* at this point (because we intentionally didn't in dqget_noattach).
*/
if (error) {
- xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ xfs_qm_dqpurge_all(mp);
goto error_return;
}
@@ -1402,7 +1415,7 @@ xfs_qm_mount_quotas(
goto write_changes;
}
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ ASSERT(XFS_IS_QUOTA_ON(mp));
/*
* Allocate the quotainfo structure inside the mount struct, and
@@ -1457,7 +1470,7 @@ xfs_qm_mount_quotas(
* the incore structures are convinced that quotas are
* off, but the on disk superblock doesn't know that !
*/
- ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+ ASSERT(!(XFS_IS_QUOTA_ON(mp)));
xfs_alert(mp, "%s: Superblock update failed!",
__func__);
}
@@ -1488,7 +1501,7 @@ xfs_qm_init_quotainos(
/*
* Get the uquota and gquota inodes
*/
- if (xfs_sb_version_hasquota(&mp->m_sb)) {
+ if (xfs_has_quota(mp)) {
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
@@ -1588,8 +1601,7 @@ xfs_qm_dqfree_one(
struct xfs_quotainfo *qi = mp->m_quotainfo;
mutex_lock(&qi->qi_tree_lock);
- radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
- be32_to_cpu(dqp->q_core.d_id));
+ radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
qi->qi_dquots--;
mutex_unlock(&qi->qi_tree_lock);
@@ -1613,8 +1625,8 @@ xfs_qm_dqfree_one(
int
xfs_qm_vop_dqalloc(
struct xfs_inode *ip,
- xfs_dqid_t uid,
- xfs_dqid_t gid,
+ kuid_t uid,
+ kgid_t gid,
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
@@ -1622,20 +1634,22 @@ xfs_qm_vop_dqalloc(
struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ struct user_namespace *user_ns = inode->i_sb->s_user_ns;
struct xfs_dquot *uq = NULL;
struct xfs_dquot *gq = NULL;
struct xfs_dquot *pq = NULL;
int error;
uint lockflags;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
lockflags = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockflags);
if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
- gid = ip->i_d.di_gid;
+ gid = inode->i_gid;
/*
* Attach the dquot(s) to this inode, doing a dquot allocation
@@ -1650,7 +1664,8 @@ xfs_qm_vop_dqalloc(
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
- if (ip->i_d.di_uid != uid) {
+ ASSERT(O_udqpp);
+ if (!uid_eq(inode->i_uid, uid)) {
/*
* What we need is the dquot that has this uid, and
* if we send the inode to dqget, the uid of the inode
@@ -1661,7 +1676,8 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
+ error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
+ XFS_DQTYPE_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
@@ -1682,9 +1698,11 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
- if (ip->i_d.di_gid != gid) {
+ ASSERT(O_gdqpp);
+ if (!gid_eq(inode->i_gid, gid)) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
+ error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
+ XFS_DQTYPE_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1698,10 +1716,11 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
- if (ip->i_d.di_projid != prid) {
+ ASSERT(O_pdqpp);
+ if (ip->i_projid != prid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
- true, &pq);
+ error = xfs_qm_dqget(mp, prid,
+ XFS_DQTYPE_PROJ, true, &pq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1714,8 +1733,7 @@ xfs_qm_vop_dqalloc(
pq = xfs_qm_dqhold(ip->i_pdquot);
}
}
- if (uq)
- trace_xfs_dquot_dqalloc(ip);
+ trace_xfs_dquot_dqalloc(ip);
xfs_iunlock(ip, lockflags);
if (O_udqpp)
@@ -1755,123 +1773,50 @@ xfs_qm_vop_chown(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+ ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
/* old dquot */
prevdq = *IO_olddq;
ASSERT(prevdq);
ASSERT(prevdq != newdq);
- xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
+ xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks));
xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
/* the sparkling new dquot */
- xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
+ xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks);
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
- * Take an extra reference, because the inode is going to keep
- * this dquot pointer even after the trans_commit.
+ * Back when we made quota reservations for the chown, we reserved the
+ * ondisk blocks + delalloc blocks with the new dquot. Now that we've
+ * switched the dquots, decrease the new dquot's block reservation
+ * (having already bumped up the real counter) so that we don't have
+ * any reservation to give back when we commit.
*/
- *IO_olddq = xfs_qm_dqhold(newdq);
-
- return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp,
- struct xfs_dquot *pdqp,
- uint flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t delblks;
- unsigned int blkflags, prjflags = 0;
- struct xfs_dquot *udq_unres = NULL;
- struct xfs_dquot *gdq_unres = NULL;
- struct xfs_dquot *pdq_unres = NULL;
- struct xfs_dquot *udq_delblks = NULL;
- struct xfs_dquot *gdq_delblks = NULL;
- struct xfs_dquot *pdq_delblks = NULL;
- int error;
-
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ -ip->i_delayed_blks);
- delblks = ip->i_delayed_blks;
- blkflags = XFS_IS_REALTIME_INODE(ip) ?
- XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
-
- if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
- udq_delblks = udqp;
- /*
- * If there are delayed allocation blocks, then we have to
- * unreserve those from the old dquot, and add them to the
- * new dquot.
- */
- if (delblks) {
- ASSERT(ip->i_udquot);
- udq_unres = ip->i_udquot;
- }
- }
- if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
- gdq_delblks = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- gdq_unres = ip->i_gdquot;
- }
- }
-
- if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) {
- prjflags = XFS_QMOPT_ENOSPC;
- pdq_delblks = pdqp;
- if (delblks) {
- ASSERT(ip->i_pdquot);
- pdq_unres = ip->i_pdquot;
- }
- }
-
- error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags);
- if (error)
- return error;
+ /*
+ * Give the incore reservation for delalloc blocks back to the old
+ * dquot. We don't normally handle delalloc quota reservations
+ * transactionally, so just lock the dquot and subtract from the
+ * reservation. Dirty the transaction because it's too late to turn
+ * back now.
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_dqlock(prevdq);
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ xfs_dqunlock(prevdq);
/*
- * Do the delayed blks reservations/unreservations now. Since, these
- * are done without the help of a transaction, if a reservation fails
- * its previous reservations won't be automatically undone by trans
- * code. So, we have to do it manually here.
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
*/
- if (delblks) {
- /*
- * Do the reservations first. Unreservation can't fail.
- */
- ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
- ASSERT(udq_unres || gdq_unres || pdq_unres);
- error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags);
- if (error)
- return error;
- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_unres, gdq_unres, pdq_unres,
- -((xfs_qcnt_t)delblks), 0, blkflags);
- }
+ *IO_olddq = xfs_qm_dqhold(newdq);
- return 0;
+ return prevdq;
}
int
@@ -1881,7 +1826,7 @@ xfs_qm_vop_rename_dqattach(
struct xfs_mount *mp = i_tab[0]->i_mount;
int i;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
for (i = 0; (i < 4 && i_tab[i]); i++) {
@@ -1912,31 +1857,65 @@ xfs_qm_vop_create_dqattach(
{
struct xfs_mount *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+ ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
+ ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
+
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
- ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id));
+ ASSERT(ip->i_projid == pdqp->q_id);
ip->i_pdquot = xfs_qm_dqhold(pdqp);
xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
}
+/* Decide if this inode's dquot is near an enforcement boundary. */
+bool
+xfs_inode_near_dquot_enforcement(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
+{
+ struct xfs_dquot *dqp;
+ int64_t freesp;
+
+ /* We only care for quotas that are enabled and enforced. */
+ dqp = xfs_inode_dquot(ip, type);
+ if (!dqp || !xfs_dquot_is_enforced(dqp))
+ return false;
+
+ if (xfs_dquot_res_over_limits(&dqp->q_ino) ||
+ xfs_dquot_res_over_limits(&dqp->q_rtb))
+ return true;
+
+ /* For space on the data device, check the various thresholds. */
+ if (!dqp->q_prealloc_hi_wmark)
+ return false;
+
+ if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark)
+ return false;
+
+ if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark)
+ return true;
+
+ freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved;
+ if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT])
+ return true;
+
+ return false;
+}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 4e57edca8bce..9683f0457d19 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -11,7 +11,7 @@
struct xfs_inode;
-extern struct kmem_zone *xfs_qm_dqtrxzone;
+extern struct kmem_cache *xfs_dqtrx_cache;
/*
* Number of bmaps that we ask from bmapi when doing a quotacheck.
@@ -20,34 +20,27 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
#define XFS_DQITER_MAP_SIZE 10
#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
- !dqp->q_core.d_blk_hardlimit && \
- !dqp->q_core.d_blk_softlimit && \
- !dqp->q_core.d_rtb_hardlimit && \
- !dqp->q_core.d_rtb_softlimit && \
- !dqp->q_core.d_ino_hardlimit && \
- !dqp->q_core.d_ino_softlimit && \
- !dqp->q_core.d_bcount && \
- !dqp->q_core.d_rtbcount && \
- !dqp->q_core.d_icount)
-
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+ !dqp->q_blk.hardlimit && \
+ !dqp->q_blk.softlimit && \
+ !dqp->q_rtb.hardlimit && \
+ !dqp->q_rtb.softlimit && \
+ !dqp->q_ino.hardlimit && \
+ !dqp->q_ino.softlimit && \
+ !dqp->q_blk.count && \
+ !dqp->q_rtb.count && \
+ !dqp->q_ino.count)
+
+struct xfs_quota_limits {
+ xfs_qcnt_t hard; /* default hard limit */
+ xfs_qcnt_t soft; /* default soft limit */
+ time64_t time; /* limit for timers */
+};
+/* Defaults for each quota type: time limits, warn limits, usage limits */
struct xfs_def_quota {
- xfs_qcnt_t bhardlimit; /* default data blk hard limit */
- xfs_qcnt_t bsoftlimit; /* default data blk soft limit */
- xfs_qcnt_t ihardlimit; /* default inode count hard limit */
- xfs_qcnt_t isoftlimit; /* default inode count soft limit */
- xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */
- xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */
+ struct xfs_quota_limits blk;
+ struct xfs_quota_limits ino;
+ struct xfs_quota_limits rtb;
};
/*
@@ -55,41 +48,39 @@ struct xfs_def_quota {
* The mount structure keeps a pointer to this.
*/
struct xfs_quotainfo {
- struct radix_tree_root qi_uquota_tree;
- struct radix_tree_root qi_gquota_tree;
- struct radix_tree_root qi_pquota_tree;
- struct mutex qi_tree_lock;
+ struct radix_tree_root qi_uquota_tree;
+ struct radix_tree_root qi_gquota_tree;
+ struct radix_tree_root qi_pquota_tree;
+ struct mutex qi_tree_lock;
struct xfs_inode *qi_uquotaip; /* user quota inode */
struct xfs_inode *qi_gquotaip; /* group quota inode */
struct xfs_inode *qi_pquotaip; /* project quota inode */
- struct list_lru qi_lru;
- int qi_dquots;
- time64_t qi_btimelimit; /* limit for blks timer */
- time64_t qi_itimelimit; /* limit for inodes timer */
- time64_t qi_rtbtimelimit;/* limit for rt blks timer */
- xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
- xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
- xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
- struct mutex qi_quotaofflock;/* to serialize quotaoff */
- xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
- uint qi_dqperchunk; /* # ondisk dqs in above chunk */
+ struct list_lru qi_lru;
+ int qi_dquots;
+ struct mutex qi_quotaofflock;/* to serialize quotaoff */
+ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
+ uint qi_dqperchunk; /* # ondisk dq in above chunk */
struct xfs_def_quota qi_usr_default;
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
- struct shrinker qi_shrinker;
+ struct shrinker qi_shrinker;
+
+ /* Minimum and maximum quota expiration timestamp values. */
+ time64_t qi_expiry_min;
+ time64_t qi_expiry_max;
};
static inline struct radix_tree_root *
xfs_dquot_tree(
struct xfs_quotainfo *qi,
- int type)
+ xfs_dqtype_t type)
{
switch (type) {
- case XFS_DQ_USER:
+ case XFS_DQTYPE_USER:
return &qi->qi_uquota_tree;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return &qi->qi_gquota_tree;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return &qi->qi_pquota_tree;
default:
ASSERT(0);
@@ -98,14 +89,14 @@ xfs_dquot_tree(
}
static inline struct xfs_inode *
-xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
+xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
{
- switch (dq_flags & XFS_DQ_ALLTYPES) {
- case XFS_DQ_USER:
+ switch (type) {
+ case XFS_DQTYPE_USER:
return mp->m_quotainfo->qi_uquotaip;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return mp->m_quotainfo->qi_gquotaip;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return mp->m_quotainfo->qi_pquotaip;
default:
ASSERT(0);
@@ -142,41 +133,39 @@ struct xfs_dquot_acct {
#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_BWARNLIMIT 5
-#define XFS_QM_IWARNLIMIT 5
-#define XFS_QM_RTBWARNLIMIT 5
-
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
-/* dquot stuff */
-extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
-extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
-
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
- uint, struct qc_dqblk *);
-extern int xfs_qm_scall_getquota_next(struct xfs_mount *,
- xfs_dqid_t *, uint, struct qc_dqblk *);
-extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
- struct qc_dqblk *);
+extern int xfs_qm_scall_getquota(struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst);
+extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp,
+ xfs_dqid_t *id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst);
+extern int xfs_qm_scall_setqlim(struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *newlim);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
static inline struct xfs_def_quota *
-xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
+xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
{
- struct xfs_def_quota *defq;
-
- if (XFS_QM_ISUDQ(dqp))
- defq = &qi->qi_usr_default;
- else if (XFS_QM_ISGDQ(dqp))
- defq = &qi->qi_grp_default;
- else {
- ASSERT(XFS_QM_ISPDQ(dqp));
- defq = &qi->qi_prj_default;
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return &qi->qi_usr_default;
+ case XFS_DQTYPE_GROUP:
+ return &qi->qi_grp_default;
+ case XFS_DQTYPE_PROJ:
+ return &qi->qi_prj_default;
+ default:
+ ASSERT(0);
+ return NULL;
}
- return defq;
}
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index fc2fa418919f..b77673dd0558 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -23,24 +23,24 @@ xfs_fill_statvfs_from_dquot(
{
uint64_t limit;
- limit = dqp->q_core.d_blk_softlimit ?
- be64_to_cpu(dqp->q_core.d_blk_softlimit) :
- be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ limit = dqp->q_blk.softlimit ?
+ dqp->q_blk.softlimit :
+ dqp->q_blk.hardlimit;
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
statp->f_bfree = statp->f_bavail =
- (statp->f_blocks > dqp->q_res_bcount) ?
- (statp->f_blocks - dqp->q_res_bcount) : 0;
+ (statp->f_blocks > dqp->q_blk.reserved) ?
+ (statp->f_blocks - dqp->q_blk.reserved) : 0;
}
- limit = dqp->q_core.d_ino_softlimit ?
- be64_to_cpu(dqp->q_core.d_ino_softlimit) :
- be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+ limit = dqp->q_ino.softlimit ?
+ dqp->q_ino.softlimit :
+ dqp->q_ino.hardlimit;
if (limit && statp->f_files > limit) {
statp->f_files = limit;
statp->f_ffree =
- (statp->f_files > dqp->q_res_icount) ?
- (statp->f_files - dqp->q_res_icount) : 0;
+ (statp->f_files > dqp->q_ino.reserved) ?
+ (statp->f_files - dqp->q_ino.reserved) : 0;
}
}
@@ -60,7 +60,7 @@ xfs_qm_statvfs(
struct xfs_mount *mp = ip->i_mount;
struct xfs_dquot *dqp;
- if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) {
+ if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
@@ -75,7 +75,7 @@ xfs_qm_newmount(
uint quotaondisk;
uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
- quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
+ quotaondisk = xfs_has_quota(mp) &&
(mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
if (quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 1ea82764bf89..392cb39cc10c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -19,92 +19,11 @@
#include "xfs_qm.h"
#include "xfs_icache.h"
-STATIC int
-xfs_qm_log_quotaoff(
- struct xfs_mount *mp,
- struct xfs_qoff_logitem **qoffstartp,
- uint flags)
-{
- struct xfs_trans *tp;
- int error;
- struct xfs_qoff_logitem *qoffi;
-
- *qoffstartp = NULL;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
- if (error)
- goto out;
-
- qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_log_sb(tp);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp);
- if (error)
- goto out;
-
- *qoffstartp = qoffi;
-out:
- return error;
-}
-
-STATIC int
-xfs_qm_log_quotaoff_end(
- struct xfs_mount *mp,
- struct xfs_qoff_logitem *startqoff,
- uint flags)
-{
- struct xfs_trans *tp;
- int error;
- struct xfs_qoff_logitem *qoffi;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
- if (error)
- return error;
-
- qoffi = xfs_trans_get_qoff_item(tp, startqoff,
- flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
-/*
- * Turn off quota accounting and/or enforcement for all udquots and/or
- * gdquots. Called only at unmount time.
- *
- * This assumes that there are no dquots of this file system cached
- * incore, and modifies the ondisk dquot directly. Therefore, for example,
- * it is an error to call this twice, without purging the cache.
- */
int
xfs_qm_scall_quotaoff(
xfs_mount_t *mp,
uint flags)
{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- uint dqtype;
- int error;
- uint inactivate_flags;
- struct xfs_qoff_logitem *qoffstart;
-
/*
* No file system can have quotas enabled on disk but not in core.
* Note that quota utilities (like quotaoff) _expect_
@@ -112,157 +31,23 @@ xfs_qm_scall_quotaoff(
*/
if ((mp->m_qflags & flags) == 0)
return -EEXIST;
- error = 0;
-
- flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-
- /*
- * We don't want to deal with two quotaoffs messing up each other,
- * so we're going to serialize it. quotaoff isn't exactly a performance
- * critical thing.
- * If quotaoff, then we must be dealing with the root filesystem.
- */
- ASSERT(q);
- mutex_lock(&q->qi_quotaofflock);
-
- /*
- * If we're just turning off quota enforcement, change mp and go.
- */
- if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
- mp->m_qflags &= ~(flags);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = mp->m_qflags;
- spin_unlock(&mp->m_sb_lock);
- mutex_unlock(&q->qi_quotaofflock);
-
- /* XXX what to do if error ? Revert back to old vals incore ? */
- return xfs_sync_sb(mp, false);
- }
-
- dqtype = 0;
- inactivate_flags = 0;
- /*
- * If accounting is off, we must turn enforcement off, clear the
- * quota 'CHKD' certificate to make it known that we have to
- * do a quotacheck the next time this quota is turned on.
- */
- if (flags & XFS_UQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_UQUOTA;
- flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
- inactivate_flags |= XFS_UQUOTA_ACTIVE;
- }
- if (flags & XFS_GQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_GQUOTA;
- flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
- inactivate_flags |= XFS_GQUOTA_ACTIVE;
- }
- if (flags & XFS_PQUOTA_ACCT) {
- dqtype |= XFS_QMOPT_PQUOTA;
- flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
- inactivate_flags |= XFS_PQUOTA_ACTIVE;
- }
-
- /*
- * Nothing to do? Don't complain. This happens when we're just
- * turning off quota enforcement.
- */
- if ((mp->m_qflags & flags) == 0)
- goto out_unlock;
-
- /*
- * Write the LI_QUOTAOFF log record, and do SB changes atomically,
- * and synchronously. If we fail to write, we should abort the
- * operation as it cannot be recovered safely if we crash.
- */
- error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
- if (error)
- goto out_unlock;
-
- /*
- * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
- * to take care of the race between dqget and quotaoff. We don't take
- * any special locks to reset these bits. All processes need to check
- * these bits *after* taking inode lock(s) to see if the particular
- * quota type is in the process of being turned off. If *ACTIVE, it is
- * guaranteed that all dquot structures and all quotainode ptrs will all
- * stay valid as long as that inode is kept locked.
- *
- * There is no turning back after this.
- */
- mp->m_qflags &= ~inactivate_flags;
-
- /*
- * Give back all the dquot reference(s) held by inodes.
- * Here we go thru every single incore inode in this file system, and
- * do a dqrele on the i_udquot/i_gdquot that it may have.
- * Essentially, as long as somebody has an inode locked, this guarantees
- * that quotas will not be turned off. This is handy because in a
- * transaction once we lock the inode(s) and check for quotaon, we can
- * depend on the quota inodes (and other things) being valid as long as
- * we keep the lock(s).
- */
- xfs_qm_dqrele_all_inodes(mp, flags);
-
- /*
- * Next we make the changes in the quota flag in the mount struct.
- * This isn't protected by a particular lock directly, because we
- * don't want to take a mrlock every time we depend on quotas being on.
- */
- mp->m_qflags &= ~flags;
-
- /*
- * Go through all the dquots of this file system and purge them,
- * according to what was turned off.
- */
- xfs_qm_dqpurge_all(mp, dqtype);
-
- /*
- * Transactions that had started before ACTIVE state bit was cleared
- * could have logged many dquots, so they'd have higher LSNs than
- * the first QUOTAOFF log record does. If we happen to crash when
- * the tail of the log has gone past the QUOTAOFF record, but
- * before the last dquot modification, those dquots __will__
- * recover, and that's not good.
- *
- * So, we have QUOTAOFF start and end logitems; the start
- * logitem won't get overwritten until the end logitem appears...
- */
- error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
- if (error) {
- /* We're screwed now. Shutdown is the only option. */
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- goto out_unlock;
- }
/*
- * If all quotas are completely turned off, close shop.
+ * We do not support actually turning off quota accounting any more.
+ * Just log a warning and ignore the accounting related flags.
*/
- if (mp->m_qflags == 0) {
- mutex_unlock(&q->qi_quotaofflock);
- xfs_qm_destroy_quotainfo(mp);
- return 0;
- }
+ if (flags & XFS_ALL_QUOTA_ACCT)
+ xfs_info(mp, "disabling of quota accounting not supported.");
- /*
- * Release our quotainode references if we don't need them anymore.
- */
- if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
- xfs_irele(q->qi_uquotaip);
- q->qi_uquotaip = NULL;
- }
- if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
- xfs_irele(q->qi_gquotaip);
- q->qi_gquotaip = NULL;
- }
- if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
- xfs_irele(q->qi_pquotaip);
- q->qi_pquotaip = NULL;
- }
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ mp->m_qflags &= ~(flags & XFS_ALL_QUOTA_ENFD);
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = mp->m_qflags;
+ spin_unlock(&mp->m_sb_lock);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
-out_unlock:
- mutex_unlock(&q->qi_quotaofflock);
- return error;
+ /* XXX what to do if error ? Revert back to old vals incore ? */
+ return xfs_sync_sb(mp, false);
}
STATIC int
@@ -292,7 +77,7 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- ip->i_d.di_size = 0;
+ ip->i_disk_size = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
@@ -301,7 +86,7 @@ xfs_qm_scall_trunc_qfile(
goto out_unlock;
}
- ASSERT(ip->i_d.di_nextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp);
@@ -320,24 +105,24 @@ xfs_qm_scall_trunc_qfiles(
{
int error = -EINVAL;
- if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
- (flags & ~XFS_DQ_ALLTYPES)) {
+ if (!xfs_has_quota(mp) || flags == 0 ||
+ (flags & ~XFS_QMOPT_QUOTALL)) {
xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
return -EINVAL;
}
- if (flags & XFS_DQ_USER) {
+ if (flags & XFS_QMOPT_UQUOTA) {
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
if (error)
return error;
}
- if (flags & XFS_DQ_GROUP) {
+ if (flags & XFS_QMOPT_GQUOTA) {
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
if (error)
return error;
}
- if (flags & XFS_DQ_PROJ)
+ if (flags & XFS_QMOPT_PQUOTA)
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
return error;
@@ -356,11 +141,11 @@ xfs_qm_scall_quotaon(
int error;
uint qf;
- flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
/*
- * Switching on quota accounting must be done at mount time.
+ * Switching on quota accounting must be done at mount time,
+ * only consider quota enforcement stuff here.
*/
- flags &= ~(XFS_ALL_QUOTA_ACCT);
+ flags &= XFS_ALL_QUOTA_ENFD;
if (flags == 0) {
xfs_debug(mp, "%s: zero flags, m_qflags=%x",
@@ -419,7 +204,7 @@ xfs_qm_scall_quotaon(
(mp->m_qflags & XFS_GQUOTA_ACCT)))
return 0;
- if (! XFS_IS_QUOTA_RUNNING(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
/*
@@ -432,8 +217,54 @@ xfs_qm_scall_quotaon(
return 0;
}
-#define XFS_QC_MASK \
- (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
+
+/*
+ * Adjust limits of this quota, and the defaults if passed in. Returns true
+ * if the new limits made sense and were applied, false otherwise.
+ */
+static inline bool
+xfs_setqlim_limits(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ xfs_qcnt_t hard,
+ xfs_qcnt_t soft,
+ const char *tag)
+{
+ /* The hard limit can't be less than the soft limit. */
+ if (hard != 0 && hard < soft) {
+ xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag,
+ soft);
+ return false;
+ }
+
+ res->hardlimit = hard;
+ res->softlimit = soft;
+ if (qlim) {
+ qlim->hard = hard;
+ qlim->soft = soft;
+ }
+
+ return true;
+}
+
+static inline void
+xfs_setqlim_timer(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ s64 timer)
+{
+ if (qlim) {
+ /* Set the length of the default grace period. */
+ res->timer = xfs_dquot_set_grace_period(timer);
+ qlim->time = res->timer;
+ } else {
+ /* Set the grace period expiration on a quota. */
+ res->timer = xfs_dquot_set_timeout(mp, timer);
+ }
+}
/*
* Adjust quota limits, and start/stop timers accordingly.
@@ -442,14 +273,15 @@ int
xfs_qm_scall_setqlim(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct qc_dqblk *newlim)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_disk_dquot *ddq;
struct xfs_dquot *dqp;
struct xfs_trans *tp;
struct xfs_def_quota *defq;
+ struct xfs_dquot_res *res;
+ struct xfs_quota_limits *qlim;
int error;
xfs_qcnt_t hard, soft;
@@ -459,13 +291,6 @@ xfs_qm_scall_setqlim(
return 0;
/*
- * We don't want to race with a quotaoff so take the quotaoff lock.
- * We don't hold an inode lock, so there's nothing else to stop
- * a quotaoff from happening.
- */
- mutex_lock(&q->qi_quotaofflock);
-
- /*
* Get the dquot (locked) before we start, as we need to do a
* transaction to allocate it if it doesn't exist. Once we have the
* dquot, unlock it so we can start the next transaction safely. We hold
@@ -475,10 +300,10 @@ xfs_qm_scall_setqlim(
error = xfs_qm_dqget(mp, id, type, true, &dqp);
if (error) {
ASSERT(error != -ENOENT);
- goto out_unlock;
+ return error;
}
- defq = xfs_get_defquota(dqp, q);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
xfs_dqunlock(dqp);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
@@ -487,99 +312,68 @@ xfs_qm_scall_setqlim(
xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
- ddq = &dqp->q_core;
/*
+ * Update quota limits, warnings, and timers, and the defaults
+ * if we're touching id == 0.
+ *
* Make sure that hardlimits are >= soft limits before changing.
+ *
+ * Update warnings counter(s) if requested.
+ *
+ * Timelimits for the super user set the relative time the other users
+ * can be over quota for this file system. If it is zero a default is
+ * used. Ditto for the default soft and hard limit values (already
+ * done, above), and for warnings.
+ *
+ * For other IDs, userspace can bump out the grace period if over
+ * the soft limit.
*/
+
+ /* Blocks on the data device. */
hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
- be64_to_cpu(ddq->d_blk_hardlimit);
+ dqp->q_blk.hardlimit;
soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
- be64_to_cpu(ddq->d_blk_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_blk_hardlimit = cpu_to_be64(hard);
- ddq->d_blk_softlimit = cpu_to_be64(soft);
+ dqp->q_blk.softlimit;
+ res = &dqp->q_blk;
+ qlim = id == 0 ? &defq->blk : NULL;
+
+ if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
xfs_dquot_set_prealloc_limits(dqp);
- if (id == 0) {
- defq->bhardlimit = hard;
- defq->bsoftlimit = soft;
- }
- } else {
- xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
- }
+ if (newlim->d_fieldmask & QC_SPC_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
+
+ /* Blocks on the realtime device. */
hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
- be64_to_cpu(ddq->d_rtb_hardlimit);
+ dqp->q_rtb.hardlimit;
soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
- be64_to_cpu(ddq->d_rtb_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_rtb_hardlimit = cpu_to_be64(hard);
- ddq->d_rtb_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- defq->rtbhardlimit = hard;
- defq->rtbsoftlimit = soft;
- }
- } else {
- xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
- }
+ dqp->q_rtb.softlimit;
+ res = &dqp->q_rtb;
+ qlim = id == 0 ? &defq->rtb : NULL;
+
+ xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
+ if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
+ /* Inodes */
hard = (newlim->d_fieldmask & QC_INO_HARD) ?
(xfs_qcnt_t) newlim->d_ino_hardlimit :
- be64_to_cpu(ddq->d_ino_hardlimit);
+ dqp->q_ino.hardlimit;
soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
(xfs_qcnt_t) newlim->d_ino_softlimit :
- be64_to_cpu(ddq->d_ino_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_ino_hardlimit = cpu_to_be64(hard);
- ddq->d_ino_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- defq->ihardlimit = hard;
- defq->isoftlimit = soft;
- }
- } else {
- xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
- }
+ dqp->q_ino.softlimit;
+ res = &dqp->q_ino;
+ qlim = id == 0 ? &defq->ino : NULL;
- /*
- * Update warnings counter(s) if requested
- */
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
- if (newlim->d_fieldmask & QC_INO_WARNS)
- ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
-
- if (id == 0) {
- /*
- * Timelimits for the super user set the relative time
- * the other users can be over quota for this file system.
- * If it is zero a default is used. Ditto for the default
- * soft and hard limit values (already done, above), and
- * for warnings.
- */
- if (newlim->d_fieldmask & QC_SPC_TIMER) {
- q->qi_btimelimit = newlim->d_spc_timer;
- ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
- }
- if (newlim->d_fieldmask & QC_INO_TIMER) {
- q->qi_itimelimit = newlim->d_ino_timer;
- ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
- }
- if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
- q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
- ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
- }
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- q->qi_bwarnlimit = newlim->d_spc_warns;
- if (newlim->d_fieldmask & QC_INO_WARNS)
- q->qi_iwarnlimit = newlim->d_ino_warns;
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
- } else {
+ xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
+ if (newlim->d_fieldmask & QC_INO_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
+
+ if (id != 0) {
/*
* If the user is now over quota, start the timelimit.
* The user will not be 'warned'.
@@ -587,17 +381,15 @@ xfs_qm_scall_setqlim(
* is on or off. We don't really want to bother with iterating
* over all ondisk dquots and turning the timers on/off.
*/
- xfs_qm_adjust_dqtimers(mp, ddq);
+ xfs_qm_adjust_dqtimers(dqp);
}
- dqp->dq_flags |= XFS_DQ_DIRTY;
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
xfs_trans_log_dquot(tp, dqp);
error = xfs_trans_commit(tp);
out_rele:
xfs_qm_dqrele(dqp);
-out_unlock:
- mutex_unlock(&q->qi_quotaofflock);
return error;
}
@@ -605,58 +397,46 @@ out_unlock:
static void
xfs_qm_scall_getquota_fill_qc(
struct xfs_mount *mp,
- uint type,
+ xfs_dqtype_t type,
const struct xfs_dquot *dqp,
struct qc_dqblk *dst)
{
memset(dst, 0, sizeof(*dst));
- dst->d_spc_hardlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
- dst->d_spc_softlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
- dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
- dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
- dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
- dst->d_ino_count = dqp->q_res_icount;
- dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
- dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
- dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
- dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
- dst->d_rt_spc_hardlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
- dst->d_rt_spc_softlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
- dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
- dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
- dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+ dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit);
+ dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit);
+ dst->d_ino_hardlimit = dqp->q_ino.hardlimit;
+ dst->d_ino_softlimit = dqp->q_ino.softlimit;
+ dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved);
+ dst->d_ino_count = dqp->q_ino.reserved;
+ dst->d_spc_timer = dqp->q_blk.timer;
+ dst->d_ino_timer = dqp->q_ino.timer;
+ dst->d_ino_warns = 0;
+ dst->d_spc_warns = 0;
+ dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
+ dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
+ dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
+ dst->d_rt_spc_timer = dqp->q_rtb.timer;
+ dst->d_rt_spc_warns = 0;
/*
* Internally, we don't reset all the timers when quota enforcement
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
- dqp->q_core.d_flags == XFS_DQ_USER) ||
- (!XFS_IS_GQUOTA_ENFORCED(mp) &&
- dqp->q_core.d_flags == XFS_DQ_GROUP) ||
- (!XFS_IS_PQUOTA_ENFORCED(mp) &&
- dqp->q_core.d_flags == XFS_DQ_PROJ)) {
+ if (!xfs_dquot_is_enforced(dqp)) {
dst->d_spc_timer = 0;
dst->d_ino_timer = 0;
dst->d_rt_spc_timer = 0;
}
#ifdef DEBUG
- if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
- (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
- (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
- dqp->q_core.d_id != 0) {
+ if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) {
if ((dst->d_space > dst->d_spc_softlimit) &&
(dst->d_spc_softlimit > 0)) {
ASSERT(dst->d_spc_timer != 0);
}
- if ((dst->d_ino_count > dst->d_ino_softlimit) &&
- (dst->d_ino_softlimit > 0)) {
+ if ((dst->d_ino_count > dqp->q_ino.softlimit) &&
+ (dqp->q_ino.softlimit > 0)) {
ASSERT(dst->d_ino_timer != 0);
}
}
@@ -668,13 +448,20 @@ int
xfs_qm_scall_getquota(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct qc_dqblk *dst)
{
struct xfs_dquot *dqp;
int error;
/*
+ * Expedite pending inodegc work at the start of a quota reporting
+ * scan but don't block waiting for it to complete.
+ */
+ if (id == 0)
+ xfs_inodegc_push(mp);
+
+ /*
* Try to get the dquot. We don't want it allocated on disk, so don't
* set doalloc. If it doesn't exist, we'll get ENOENT back.
*/
@@ -706,71 +493,25 @@ int
xfs_qm_scall_getquota_next(
struct xfs_mount *mp,
xfs_dqid_t *id,
- uint type,
+ xfs_dqtype_t type,
struct qc_dqblk *dst)
{
struct xfs_dquot *dqp;
int error;
+ /* Flush inodegc work at the start of a quota reporting scan. */
+ if (*id == 0)
+ xfs_inodegc_push(mp);
+
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
return error;
/* Fill in the ID we actually read from disk */
- *id = be32_to_cpu(dqp->q_core.d_id);
+ *id = dqp->q_id;
xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
xfs_qm_dqput(dqp);
return error;
}
-
-STATIC int
-xfs_dqrele_inode(
- struct xfs_inode *ip,
- int flags,
- void *args)
-{
- /* skip quota inodes */
- if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
- ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
- ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_gdquot == NULL);
- ASSERT(ip->i_pdquot == NULL);
- return 0;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
- xfs_qm_dqrele(ip->i_udquot);
- ip->i_udquot = NULL;
- }
- if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
- xfs_qm_dqrele(ip->i_gdquot);
- ip->i_gdquot = NULL;
- }
- if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
- xfs_qm_dqrele(ip->i_pdquot);
- ip->i_pdquot = NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
-}
-
-
-/*
- * Go thru all the inodes in the file system, releasing their dquots.
- *
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff.
- */
-void
-xfs_qm_dqrele_all_inodes(
- struct xfs_mount *mp,
- uint flags)
-{
- ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
- XFS_AGITER_INEW_WAIT);
-}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index efe42ae7a2f3..dcc785fdd345 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -13,6 +13,7 @@
*/
struct xfs_trans;
+struct xfs_buf;
/*
* This check is done typically without holding the inode lock;
@@ -38,14 +39,14 @@ struct xfs_trans;
static inline uint
xfs_quota_chkd_flag(
- uint dqtype)
+ xfs_dqtype_t type)
{
- switch (dqtype) {
- case XFS_DQ_USER:
+ switch (type) {
+ case XFS_DQTYPE_USER:
return XFS_UQUOTA_CHKD;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return XFS_GQUOTA_CHKD;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return XFS_PQUOTA_CHKD;
default:
return 0;
@@ -80,13 +81,16 @@ extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
-extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
- struct xfs_inode *, int64_t, long, uint);
+int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
+int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, int64_t dblocks);
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
@@ -94,9 +98,6 @@ extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
-extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *,
- struct xfs_dquot *, uint);
extern int xfs_qm_dqattach(struct xfs_inode *);
extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -107,9 +108,15 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
+bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
#else
static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
prid_t prid, uint flags, struct xfs_dquot **udqp,
struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
{
@@ -120,11 +127,12 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
}
#define xfs_trans_dup_dqinfo(tp, tp2)
#define xfs_trans_free_dqinfo(tp)
-#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
- struct xfs_inode *ip, int64_t nblks, long ninos, uint flags)
+ struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
+ bool force)
{
return 0;
}
@@ -135,26 +143,40 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
{
return 0;
}
+
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return 0;
+}
+
+static inline int
+xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks)
+{
+ return 0;
+}
+
#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
-#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
#define xfs_qm_dqattach(ip) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
-#define xfs_qm_dqrele(d)
-#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_dqrele(d) do { (d) = (d); } while(0)
+#define xfs_qm_statvfs(ip, s) do { } while(0)
#define xfs_qm_newmount(mp, a, b) (0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
+#define xfs_inode_near_dquot_enforcement(ip, type) (false)
#endif /* CONFIG_XFS_QUOTA */
-#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
- xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
- xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
- f | XFS_QMOPT_RES_REGBLKS)
+static inline int
+xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_quota_reserve_blkres(ip, -blocks);
+}
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 38669e827206..9c162e69976b 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -21,10 +21,10 @@ xfs_qm_fill_state(
struct qc_type_state *tstate,
struct xfs_mount *mp,
struct xfs_inode *ip,
- xfs_ino_t ino)
+ xfs_ino_t ino,
+ struct xfs_def_quota *defq)
{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- bool tempqip = false;
+ bool tempqip = false;
tstate->ino = ino;
if (!ip && ino == NULLFSINO)
@@ -35,14 +35,14 @@ xfs_qm_fill_state(
tempqip = true;
}
tstate->flags |= QCI_SYSFILE;
- tstate->blocks = ip->i_d.di_nblocks;
- tstate->nextents = ip->i_d.di_nextents;
- tstate->spc_timelimit = (u32)q->qi_btimelimit;
- tstate->ino_timelimit = (u32)q->qi_itimelimit;
- tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit;
- tstate->spc_warnlimit = q->qi_bwarnlimit;
- tstate->ino_warnlimit = q->qi_iwarnlimit;
- tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit;
+ tstate->blocks = ip->i_nblocks;
+ tstate->nextents = ip->i_df.if_nextents;
+ tstate->spc_timelimit = (u32)defq->blk.time;
+ tstate->ino_timelimit = (u32)defq->ino.time;
+ tstate->rt_spc_timelimit = (u32)defq->rtb.time;
+ tstate->spc_warnlimit = 0;
+ tstate->ino_warnlimit = 0;
+ tstate->rt_spc_warnlimit = 0;
if (tempqip)
xfs_irele(ip);
}
@@ -60,45 +60,45 @@ xfs_fs_get_quota_state(
struct xfs_quotainfo *q = mp->m_quotainfo;
memset(state, 0, sizeof(*state));
- if (!XFS_IS_QUOTA_RUNNING(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
state->s_incoredqs = q->qi_dquots;
- if (XFS_IS_UQUOTA_RUNNING(mp))
+ if (XFS_IS_UQUOTA_ON(mp))
state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_UQUOTA_ENFORCED(mp))
state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
- if (XFS_IS_GQUOTA_RUNNING(mp))
+ if (XFS_IS_GQUOTA_ON(mp))
state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_GQUOTA_ENFORCED(mp))
state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
- if (XFS_IS_PQUOTA_RUNNING(mp))
+ if (XFS_IS_PQUOTA_ON(mp))
state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_PQUOTA_ENFORCED(mp))
state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip,
- mp->m_sb.sb_uquotino);
+ mp->m_sb.sb_uquotino, &q->qi_usr_default);
xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip,
- mp->m_sb.sb_gquotino);
+ mp->m_sb.sb_gquotino, &q->qi_grp_default);
xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip,
- mp->m_sb.sb_pquotino);
+ mp->m_sb.sb_pquotino, &q->qi_prj_default);
return 0;
}
-STATIC int
+STATIC xfs_dqtype_t
xfs_quota_type(int type)
{
switch (type) {
case USRQUOTA:
- return XFS_DQ_USER;
+ return XFS_DQTYPE_USER;
case GRPQUOTA:
- return XFS_DQ_GROUP;
+ return XFS_DQTYPE_GROUP;
default:
- return XFS_DQ_PROJ;
+ return XFS_DQTYPE_PROJ;
}
}
-#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
/*
* Adjust quota timers & warnings
@@ -109,15 +109,13 @@ xfs_fs_set_info(
int type,
struct qc_info *info)
{
- struct xfs_mount *mp = XFS_M(sb);
- struct qc_dqblk newlim;
+ struct xfs_mount *mp = XFS_M(sb);
+ struct qc_dqblk newlim;
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK)
return -EINVAL;
if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0)
@@ -164,7 +162,7 @@ xfs_quota_enable(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return -ENOSYS;
return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
@@ -179,10 +177,8 @@ xfs_quota_disable(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -EINVAL;
+ return -ENOSYS;
return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
}
@@ -205,11 +201,11 @@ xfs_fs_rm_xquota(
return -EINVAL;
if (uflags & FS_USER_QUOTA)
- flags |= XFS_DQ_USER;
+ flags |= XFS_QMOPT_UQUOTA;
if (uflags & FS_GROUP_QUOTA)
- flags |= XFS_DQ_GROUP;
+ flags |= XFS_QMOPT_GQUOTA;
if (uflags & FS_PROJ_QUOTA)
- flags |= XFS_DQ_PROJ;
+ flags |= XFS_QMOPT_PQUOTA;
return xfs_qm_scall_trunc_qfiles(mp, flags);
}
@@ -223,10 +219,8 @@ xfs_fs_get_dqblk(
struct xfs_mount *mp = XFS_M(sb);
xfs_dqid_t id;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
id = from_kqid(&init_user_ns, qid);
return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq);
@@ -243,10 +237,8 @@ xfs_fs_get_nextdqblk(
struct xfs_mount *mp = XFS_M(sb);
xfs_dqid_t id;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
id = from_kqid(&init_user_ns, *qid);
ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type),
@@ -269,10 +261,8 @@ xfs_fs_set_dqblk(
if (sb_rdonly(sb))
return -EROFS;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
- return -ESRCH;
+ return -ENOSYS;
return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
xfs_quota_type(qid.type), qdq);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 8eeed73928cd..858e3e9eb4a8 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -18,23 +18,28 @@
#include "xfs_log.h"
#include "xfs_refcount.h"
#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
-kmem_zone_t *xfs_cui_zone;
-kmem_zone_t *xfs_cud_zone;
+struct kmem_cache *xfs_cui_cache;
+struct kmem_cache *xfs_cud_cache;
+
+static const struct xfs_item_ops xfs_cui_item_ops;
static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_cui_log_item, cui_item);
}
-void
+STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
- kmem_cache_free(xfs_cui_zone, cuip);
+ kmem_cache_free(xfs_cui_cache, cuip);
}
/*
@@ -44,15 +49,16 @@ xfs_cui_item_free(
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the CUI.
*/
-void
+STATIC void
xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
ASSERT(atomic_read(&cuip->cui_refcount) > 0);
- if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_cui_item_free(cuip);
- }
+ if (!atomic_dec_and_test(&cuip->cui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&cuip->cui_item, 0);
+ xfs_cui_item_free(cuip);
}
@@ -123,17 +129,10 @@ xfs_cui_item_release(
xfs_cui_release(CUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_cui_item_ops = {
- .iop_size = xfs_cui_item_size,
- .iop_format = xfs_cui_item_format,
- .iop_unpin = xfs_cui_item_unpin,
- .iop_release = xfs_cui_item_release,
-};
-
/*
* Allocate and initialize an cui item with the given number of extents.
*/
-struct xfs_cui_log_item *
+STATIC struct xfs_cui_log_item *
xfs_cui_init(
struct xfs_mount *mp,
uint nextents)
@@ -146,7 +145,8 @@ xfs_cui_init(
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
0);
else
- cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+ cuip = kmem_cache_zalloc(xfs_cui_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
cuip->cui_format.cui_nextents = nextents;
@@ -206,14 +206,24 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_cache_free(xfs_cud_zone, cudp);
+ kmem_free(cudp->cud_item.li_lv_shadow);
+ kmem_cache_free(xfs_cud_cache, cudp);
+}
+
+static struct xfs_log_item *
+xfs_cud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &CUD_ITEM(lip)->cud_cuip->cui_item;
}
static const struct xfs_item_ops xfs_cud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_cud_item_size,
.iop_format = xfs_cud_item_format,
.iop_release = xfs_cud_item_release,
+ .iop_intent = xfs_cud_item_intent,
};
static struct xfs_cud_log_item *
@@ -223,7 +233,7 @@ xfs_trans_get_cud(
{
struct xfs_cud_log_item *cudp;
- cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
&xfs_cud_item_ops);
cudp->cud_cuip = cuip;
@@ -261,7 +271,7 @@ xfs_trans_log_finish_refcount_update(
* 1.) releases the CUI and frees the CUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
@@ -271,8 +281,8 @@ xfs_trans_log_finish_refcount_update(
static int
xfs_refcount_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_refcount_intent *ra;
@@ -284,27 +294,6 @@ xfs_refcount_update_diff_items(
XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
}
-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_cui_log_item *cuip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- cuip = xfs_cui_init(tp->t_mountp, count);
- ASSERT(cuip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &cuip->cui_item);
- return cuip;
-}
-
/* Set the phys extent flags for this reverse mapping. */
static void
xfs_trans_set_refcount_flags(
@@ -328,16 +317,12 @@ xfs_trans_set_refcount_flags(
STATIC void
xfs_refcount_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_cui_log_item *cuip,
+ struct xfs_refcount_intent *refc)
{
- struct xfs_cui_log_item *cuip = intent;
- struct xfs_refcount_intent *refc;
uint next_extent;
struct xfs_phys_extent *ext;
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
@@ -354,23 +339,44 @@ xfs_refcount_update_log_item(
xfs_trans_set_refcount_flags(ext, refc->ri_type);
}
+static struct xfs_log_item *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+ struct xfs_refcount_intent *refc;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ if (sort)
+ list_sort(mp, items, xfs_refcount_update_diff_items);
+ list_for_each_entry(refc, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, refc);
+ return &cuip->cui_item;
+}
+
/* Get an CUD so we can process all the deferred refcount updates. */
-STATIC void *
+static struct xfs_log_item *
xfs_refcount_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_cud(tp, intent);
+ return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
}
/* Process a deferred refcount update. */
STATIC int
xfs_refcount_update_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_refcount_intent *refc;
xfs_fsblock_t new_fsb;
@@ -378,12 +384,10 @@ xfs_refcount_update_finish_item(
int error;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, done_item,
- refc->ri_type,
- refc->ri_startblock,
- refc->ri_blockcount,
- &new_fsb, &new_aglen,
- (struct xfs_btree_cur **)state);
+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done),
+ refc->ri_type, refc->ri_startblock, refc->ri_blockcount,
+ &new_fsb, &new_aglen, state);
+
/* Did we run out of reservation? Requeue what we didn't finish. */
if (!error && new_aglen > 0) {
ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
@@ -392,28 +396,16 @@ xfs_refcount_update_finish_item(
refc->ri_blockcount = new_aglen;
return -EAGAIN;
}
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
return error;
}
-/* Clean up after processing deferred refcounts. */
-STATIC void
-xfs_refcount_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
-}
-
/* Abort all pending CUIs. */
STATIC void
xfs_refcount_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_cui_release(intent);
+ xfs_cui_release(CUI_ITEM(intent));
}
/* Cancel a deferred refcount update. */
@@ -424,47 +416,67 @@ xfs_refcount_update_cancel_item(
struct xfs_refcount_intent *refc;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_free(refc);
+ kmem_cache_free(xfs_refcount_intent_cache, refc);
}
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_refcount_update_diff_items,
.create_intent = xfs_refcount_update_create_intent,
.abort_intent = xfs_refcount_update_abort_intent,
- .log_item = xfs_refcount_update_log_item,
.create_done = xfs_refcount_update_create_done,
.finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_update_finish_cleanup,
+ .finish_cleanup = xfs_refcount_finish_one_cleanup,
.cancel_item = xfs_refcount_update_cancel_item,
};
+/* Is this recovered CUI ok? */
+static inline bool
+xfs_cui_validate_phys(
+ struct xfs_mount *mp,
+ struct xfs_phys_extent *refc)
+{
+ if (!xfs_has_reflink(mp))
+ return false;
+
+ if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)
+ return false;
+
+ switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ break;
+ default:
+ return false;
+ }
+
+ return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len);
+}
+
/*
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
*/
-int
-xfs_cui_recover(
- struct xfs_trans *parent_tp,
- struct xfs_cui_log_item *cuip)
+STATIC int
+xfs_cui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- int i;
- int error = 0;
- unsigned int refc_type;
+ struct xfs_bmbt_irec irec;
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_phys_extent *refc;
- xfs_fsblock_t startblock_fsb;
- bool op_ok;
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- enum xfs_refcount_intent_type type;
+ struct xfs_mount *mp = lip->li_log->l_mp;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
- struct xfs_bmbt_irec irec;
+ unsigned int refc_type;
bool requeue_only = false;
- struct xfs_mount *mp = parent_tp->t_mountp;
-
- ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
+ enum xfs_refcount_intent_type type;
+ int i;
+ int error = 0;
/*
* First check the validity of the extents described by the
@@ -472,31 +484,11 @@ xfs_cui_recover(
* just toss the CUI.
*/
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- refc = &cuip->cui_format.cui_extents[i];
- startblock_fsb = XFS_BB_TO_FSB(mp,
- XFS_FSB_TO_DADDR(mp, refc->pe_startblock));
- switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- op_ok = true;
- break;
- default:
- op_ok = false;
- break;
- }
- if (!op_ok || startblock_fsb == 0 ||
- refc->pe_len == 0 ||
- startblock_fsb >= mp->m_sb.sb_dblocks ||
- refc->pe_len >= mp->m_sb.sb_agblocks ||
- (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
- /*
- * This will pull the CUI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
- xfs_cui_release(cuip);
+ if (!xfs_cui_validate_phys(mp,
+ &cuip->cui_format.cui_extents[i])) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
return -EFSCORRUPTED;
}
}
@@ -509,7 +501,7 @@ xfs_cui_recover(
* transaction. Normally, any work that needs to be deferred
* gets attached to the same defer_ops that scheduled the
* refcount update. However, we're in log recovery here, so we
- * we use the passed in defer_ops and to finish up any work that
+ * use the passed in defer_ops and to finish up any work that
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
@@ -517,12 +509,7 @@ xfs_cui_recover(
mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
@@ -536,7 +523,9 @@ xfs_cui_recover(
type = refc_type;
break;
default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -547,6 +536,10 @@ xfs_cui_recover(
error = xfs_trans_log_finish_refcount_update(tp, cudp,
type, refc->pe_startblock, refc->pe_len,
&new_fsb, &new_len, &rcur);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
if (error)
goto abort_error;
@@ -579,14 +572,152 @@ xfs_cui_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
return error;
}
+
+STATIC bool
+xfs_cui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_cui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_cud_log_item *cudp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_phys_extent *extp;
+ unsigned int count;
+
+ count = CUI_ITEM(intent)->cui_format.cui_nextents;
+ extp = CUI_ITEM(intent)->cui_format.cui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ atomic_set(&cuip->cui_next_extent, count);
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+ return &cuip->cui_item;
+}
+
+static const struct xfs_item_ops xfs_cui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_cui_item_size,
+ .iop_format = xfs_cui_item_format,
+ .iop_unpin = xfs_cui_item_unpin,
+ .iop_release = xfs_cui_item_release,
+ .iop_recover = xfs_cui_item_recover,
+ .iop_match = xfs_cui_item_match,
+ .iop_relog = xfs_cui_item_relog,
+};
+
+static inline void
+xfs_cui_copy_format(
+ struct xfs_cui_log_format *dst,
+ const struct xfs_cui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_cui_log_format, cui_extents));
+
+ for (i = 0; i < src->cui_nextents; i++)
+ memcpy(&dst->cui_extents[i], &src->cui_extents[i],
+ sizeof(struct xfs_phys_extent));
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+ size_t len;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn);
+ xfs_cui_release(cuip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_cui_item_ops = {
+ .item_type = XFS_LI_CUI,
+ .commit_pass2 = xlog_recover_cui_commit_pass2,
+};
+
+/*
+ * This routine is called when an CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_cud_log_format *cud_formatp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_CUI, cud_formatp->cud_cui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_cud_item_ops = {
+ .item_type = XFS_LI_CUD,
+ .commit_pass2 = xlog_recover_cud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index e47530f30489..eb0ab13682d0 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -25,7 +25,7 @@
/* kernel only CUI/CUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -33,11 +33,6 @@ struct kmem_zone;
#define XFS_CUI_MAX_FAST_EXTENTS 16
/*
- * Define CUI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_CUI_RECOVERED 1
-
-/*
* This is the "refcount update intent" log item. It is used to log
* the fact that some reverse mappings need to change. It is used in
* conjunction with the "refcount update done" log item described
@@ -51,7 +46,6 @@ struct xfs_cui_log_item {
struct xfs_log_item cui_item;
atomic_t cui_refcount;
atomic_t cui_next_extent;
- unsigned long cui_flags; /* misc flags */
struct xfs_cui_log_format cui_format;
};
@@ -74,12 +68,7 @@ struct xfs_cud_log_item {
struct xfs_cud_log_format cud_format;
};
-extern struct kmem_zone *xfs_cui_zone;
-extern struct kmem_zone *xfs_cud_zone;
-
-struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
-void xfs_cui_item_free(struct xfs_cui_log_item *);
-void xfs_cui_release(struct xfs_cui_log_item *);
-int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
+extern struct kmem_cache *xfs_cui_cache;
+extern struct kmem_cache *xfs_cud_cache;
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b0ce04ffd3cd..93bdd25680bc 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -27,7 +27,7 @@
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
-#include "xfs_sb.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
@@ -125,11 +125,10 @@
* shared blocks. If there are no shared extents, fbno and flen will
* be set to NULLAGBLOCK and 0, respectively.
*/
-int
+static int
xfs_reflink_find_shared(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
xfs_agblock_t *fbno,
@@ -140,11 +139,11 @@ xfs_reflink_find_shared(
struct xfs_btree_cur *cur;
int error;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
@@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared(
struct xfs_bmbt_irec *irec,
bool *shared)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
xfs_agblock_t agbno;
xfs_extlen_t aglen;
xfs_agblock_t fbno;
@@ -179,19 +179,20 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
trace_xfs_reflink_trim_around_shared(ip, irec);
- agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
- agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
aglen = irec->br_blockcount;
- error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
- aglen, &fbno, &flen, true);
+ error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
+ true);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -199,7 +200,9 @@ xfs_reflink_trim_around_shared(
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
- } else if (fbno == agbno) {
+ }
+
+ if (fbno == agbno) {
/*
* The start of this extent is shared. Truncate the
* mapping at the end of the shared region so that a
@@ -209,16 +212,16 @@ xfs_reflink_trim_around_shared(
irec->br_blockcount = flen;
*shared = true;
return 0;
- } else {
- /*
- * There's a shared extent midway through this extent.
- * Truncate the mapping at the start of the shared
- * extent so that a subsequent iteration starts at the
- * start of the shared region.
- */
- irec->br_blockcount = fbno - agbno;
- return 0;
}
+
+ /*
+ * There's a shared extent midway through this extent.
+ * Truncate the mapping at the start of the shared
+ * extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = fbno - agbno;
+ return 0;
}
int
@@ -340,9 +343,41 @@ xfs_find_trim_cow_extent(
return 0;
}
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+ * cmap might larger than imap due to cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
@@ -351,59 +386,37 @@ xfs_reflink_allocate_cow(
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = imap->br_startoff;
- xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
- int nimaps, error = 0;
- bool found;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
-
- error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
- return error;
- if (found)
- goto convert;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
xfs_iunlock(ip, *lockmode);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- *lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, *lockmode);
+ *lockmode = 0;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
if (error)
return error;
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_trans_cancel;
+ *lockmode = XFS_ILOCK_EXCL;
- /*
- * Check for an overlapping extent again now that we dropped the ilock.
- */
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
+
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ijoin(tp, ip, 0);
+ ASSERT(cmap->br_startoff > imap->br_startoff);
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
@@ -411,7 +424,7 @@ xfs_reflink_allocate_cow(
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
&nimaps);
if (error)
- goto out_unreserve;
+ goto out_trans_cancel;
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
@@ -424,26 +437,135 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
+
convert:
- xfs_trim_extent(cmap, offset_fsb, count_fsb);
- /*
- * COW fork extents are supposed to remain unwritten until we're ready
- * to initiate a disk write. For direct I/O we are going to write the
- * data and need the conversion, but for buffered writes we're done.
- */
- if (!convert_now || cmap->br_state == XFS_EXT_NORM)
- return 0;
- trace_xfs_reflink_convert_cow(ip, cmap);
- return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
+ do {
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ break;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+ cmap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even
+ * partially satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
-out_unreserve:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /* CoW fork has a real extent */
+ if (found)
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+
+ /*
+ * CoW fork does not have an extent and data extent is shared.
+ * Allocate a real extent in the CoW fork.
+ */
+ if (cmap->br_startoff > imap->br_startoff)
+ return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /*
+ * CoW fork has a delalloc reservation. Replace it with a real extent.
+ * There may or may not be a data fork mapping.
+ */
+ if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK)
+ return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /* Shouldn't get here. */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
/*
* Cancel CoW reservations for some block range of an inode.
*
@@ -461,7 +583,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t end_fsb,
bool cancel_real)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -496,7 +618,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_bmap_add_free(*tpp, del.br_startblock,
+ xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
/* Roll the transaction */
@@ -508,9 +630,8 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -(long)del.br_blockcount, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_unreserve_blkres(ip,
+ del.br_blockcount);
if (error)
break;
} else {
@@ -596,21 +717,21 @@ out:
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- xfs_fileoff_t *end_fsb)
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- xfs_filblks_t rlen;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
+ int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
- *end_fsb = offset_fsb;
+ *offset_fsb = end_fsb;
return 0;
}
@@ -628,47 +749,79 @@ xfs_reflink_end_cow_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
- got.br_startoff + got.br_blockcount <= offset_fsb) {
- *end_fsb = offset_fsb;
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
goto out_cancel;
}
/*
- * Structure copy @got into @del, then trim @del to the range that we
- * were asked to remap. We preserve @got for the eventual CoW fork
- * deletion; from now on @del represents the mapping that we're
- * actually remapping.
- */
- del = got;
- xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
- ASSERT(del.br_blockcount > 0);
-
- /*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
- * need to skip them.
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
*/
- if (!xfs_bmap_is_real_extent(&got)) {
- *end_fsb = del.br_startoff;
- goto out_cancel;
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
}
+ del = got;
- /* Unmap the old blocks in the data fork. */
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
if (error)
goto out_cancel;
- /* Trim the extent to whatever got unmapped. */
- xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -689,7 +842,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
- *end_fsb = del.br_startoff;
+ *offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
@@ -717,11 +870,11 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
- * Walk backwards until we're out of the I/O range. The loop function
+ * Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
- * If we're being called by writeback then the the pages will still
+ * If we're being called by writeback then the pages will still
* have PageWriteback set, which prevents races with reflink remapping
* and truncate. Reflink remapping prevents races with writeback by
* taking the iolock and mmaplock before flushing the pages and
@@ -749,7 +902,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
- error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -757,22 +910,28 @@ xfs_reflink_end_cow(
}
/*
- * Free leftover CoW reservations that didn't get cleaned out.
+ * Free all CoW staging blocks that are still referenced by the ondisk refcount
+ * metadata. The ondisk metadata does not track which inode created the
+ * staging extent, so callers must ensure that there are no cached inodes with
+ * live CoW staging extents.
*/
int
xfs_reflink_recover_cow(
struct xfs_mount *mp)
{
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
int error = 0;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- error = xfs_refcount_recover_cow_leftovers(mp, agno);
- if (error)
+ for_each_perag(mp, agno, pag) {
+ error = xfs_refcount_recover_cow_leftovers(mp, pag);
+ if (error) {
+ xfs_perag_put(pag);
break;
+ }
}
return error;
@@ -882,7 +1041,7 @@ xfs_reflink_set_inode_flag(
if (!xfs_is_reflink_inode(src)) {
trace_xfs_reflink_set_inode_flag(src);
xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
- src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
xfs_ifork_init_cow(src);
} else
@@ -894,7 +1053,7 @@ xfs_reflink_set_inode_flag(
if (!xfs_is_reflink_inode(dest)) {
trace_xfs_reflink_set_inode_flag(dest);
xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
- dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
xfs_ifork_init_cow(dest);
} else
@@ -938,12 +1097,12 @@ xfs_reflink_update_dest(
if (newlen > i_size_read(VFS_I(dest))) {
trace_xfs_reflink_update_inode_size(dest, newlen);
i_size_write(VFS_I(dest), newlen);
- dest->i_d.di_size = newlen;
+ dest->i_disk_size = newlen;
}
if (cowextsize) {
- dest->i_d.di_cowextsize = cowextsize;
- dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ dest->i_cowextsize = cowextsize;
+ dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
}
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
@@ -972,7 +1131,7 @@ xfs_reflink_ag_has_free_space(
struct xfs_perag *pag;
int error = 0;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
pag = xfs_perag_get(mp, agno);
@@ -984,127 +1143,208 @@ xfs_reflink_ag_has_free_space(
}
/*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file. The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
*/
STATIC int
xfs_reflink_remap_extent(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec,
- xfs_fileoff_t destoff,
+ struct xfs_bmbt_irec *dmap,
xfs_off_t new_isize)
{
+ struct xfs_bmbt_irec smap;
struct xfs_mount *mp = ip->i_mount;
- bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
- unsigned int resblks;
- struct xfs_bmbt_irec uirec;
- xfs_filblks_t rlen;
- xfs_filblks_t unmap_len;
xfs_off_t newlen;
+ int64_t qdelta = 0;
+ unsigned int resblks;
+ bool quota_reserved = true;
+ bool smap_real;
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int iext_delta = 0;
+ int nimaps;
int error;
- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
- /* No reflinking if we're low on space */
- if (real_extent) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- if (error)
- goto out;
+ /*
+ * Start a rolling transaction to switch the mappings.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the extent can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening. However, we haven't
+ * locked the inode yet, so we reserve assuming this is the case.
+ *
+ * The first allocation call tries to reserve enough space to handle
+ * mapping dmap into a sparse part of the file plus the bmbt split. We
+ * haven't locked the inode or read the existing mapping yet, so we do
+ * not know for sure that we need the space. This should succeed most
+ * of the time.
+ *
+ * If the first attempt fails, try again but reserving only enough
+ * space to handle a bmbt split. This is the hard minimum requirement,
+ * and we revisit quota reservations later when we know more about what
+ * we're remapping.
+ */
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks + dmap->br_blockcount, 0, false, &tp);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ quota_reserved = false;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks, 0, false, &tp);
}
-
- /* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ /*
+ * Read what's currently mapped in the destination file into smap.
+ * If smap isn't a hole, we will have to remove it before we can add
+ * dmap to the destination file.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+ &smap, &nimaps, 0);
+ if (error)
+ goto out_cancel;
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+ smap_real = xfs_bmap_is_real_extent(&smap);
- /* If we're not just clearing space, then do we have enough quota? */
- if (real_extent) {
- error = xfs_trans_reserve_quota_nblks(tp, ip,
- irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_cancel;
+ /*
+ * We can only remap as many blocks as the smaller of the two extent
+ * maps, because we can only remap one extent at a time.
+ */
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to the next
+ * extent if they're both holes or both the same physical extent.
+ */
+ if (dmap->br_startblock == smap.br_startblock) {
+ if (dmap->br_state != smap.br_state)
+ error = -EFSCORRUPTED;
+ goto out_cancel;
}
- trace_xfs_reflink_remap(ip, irec->br_startoff,
- irec->br_blockcount, irec->br_startblock);
+ /* If both extents are unwritten, leave them alone. */
+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+ smap.br_state == XFS_EXT_UNWRITTEN)
+ goto out_cancel;
- /* Unmap the old blocks in the data fork. */
- rlen = unmap_len;
- while (rlen) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+ /* No reflinking if the AG of the dest mapping is low on space. */
+ if (dmap_written) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
if (error)
goto out_cancel;
+ }
- /*
- * Trim the extent to whatever got unmapped.
- * Remember, bunmapi works backwards.
- */
- uirec.br_startblock = irec->br_startblock + rlen;
- uirec.br_startoff = irec->br_startoff + rlen;
- uirec.br_blockcount = unmap_len - rlen;
- unmap_len = rlen;
-
- /* If this isn't a real mapping, we're done. */
- if (!real_extent || uirec.br_blockcount == 0)
- goto next_extent;
+ /*
+ * Increase quota reservation if we think the quota block counter for
+ * this file could increase.
+ *
+ * If we are mapping a written extent into the file, we need to have
+ * enough quota block count reservation to handle the blocks in that
+ * extent. We log only the delta to the quota block counts, so if the
+ * extent we're unmapping also has blocks allocated to it, we don't
+ * need a quota reservation for the extent itself.
+ *
+ * Note that if we're replacing a delalloc reservation with a written
+ * extent, we have to take the full quota reservation because removing
+ * the delalloc reservation gives the block count back to the quota
+ * count. This is suboptimal, but the VFS flushed the dest range
+ * before we started. That should have removed all the delalloc
+ * reservations, but we code defensively.
+ *
+ * xfs_trans_alloc_inode above already tried to grab an even larger
+ * quota reservation, and kicked off a blockgc scan if it couldn't.
+ * If we can't get a potentially smaller quota reservation now, we're
+ * done.
+ */
+ if (!quota_reserved && !smap_real && dmap_written) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip,
+ dmap->br_blockcount, 0, false);
+ if (error)
+ goto out_cancel;
+ }
- trace_xfs_reflink_remap(ip, uirec.br_startoff,
- uirec.br_blockcount, uirec.br_startblock);
+ if (smap_real)
+ ++iext_delta;
- /* Update the refcount tree */
- xfs_refcount_increase_extent(tp, &uirec);
+ if (dmap_written)
+ ++iext_delta;
- /* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ if (error)
+ goto out_cancel;
- /* Update quota accounting. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- uirec.br_blockcount);
-
- /* Update dest isize if needed. */
- newlen = XFS_FSB_TO_B(mp,
- uirec.br_startoff + uirec.br_blockcount);
- newlen = min_t(xfs_off_t, newlen, new_isize);
- if (newlen > i_size_read(VFS_I(ip))) {
- trace_xfs_reflink_update_inode_size(ip, newlen);
- i_size_write(VFS_I(ip), newlen);
- ip->i_d.di_size = newlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
+ if (smap_real) {
+ /*
+ * If the extent we're unmapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_refcount_decrease_extent(tp, &smap);
+ qdelta -= smap.br_blockcount;
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+ int done;
-next_extent:
- /* Process all the deferred stuff. */
- error = xfs_defer_finish(&tp);
+ /*
+ * If the extent we're unmapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
+ ASSERT(done);
+ }
+
+ /*
+ * If the extent we're sharing is backed by written storage, increase
+ * its refcount and map it into the file.
+ */
+ if (dmap_written) {
+ xfs_refcount_increase_extent(tp, dmap);
+ xfs_bmap_map_extent(tp, ip, dmap);
+ qdelta += dmap->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_disk_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+ /* Commit everything and unlock. */
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
- return 0;
+ goto out_unlock;
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
return error;
}
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
@@ -1115,25 +1355,22 @@ xfs_reflink_remap_blocks(
loff_t *remapped)
{
struct xfs_bmbt_irec imap;
- xfs_fileoff_t srcoff;
- xfs_fileoff_t destoff;
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
xfs_filblks_t len;
- xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
- srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
- len = XFS_B_TO_FSB(src->i_mount, remap_len);
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+ XFS_MAX_FILEOFF);
- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
- while (len) {
- uint lock_mode;
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
- dest, destoff);
+ while (len > 0) {
+ unsigned int lock_mode;
/* Read extent from the source file */
nimaps = 1;
@@ -1142,18 +1379,25 @@ xfs_reflink_remap_blocks(
xfs_iunlock(src, lock_mode);
if (error)
break;
- ASSERT(nimaps == 1);
-
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
- &imap);
+ /*
+ * The caller supposedly flushed all dirty pages in the source
+ * file range, which means that writeback should have allocated
+ * or deleted all delalloc reservations in that range. If we
+ * find one, that's a good sign that something is seriously
+ * wrong here.
+ */
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = -EFSCORRUPTED;
+ break;
+ }
- /* Translate imap into the destination file. */
- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
- imap.br_startoff += destoff - srcoff;
+ trace_xfs_reflink_remap_extent_src(src, &imap);
- /* Clear dest from destoff to the end of imap and map it in. */
- error = xfs_reflink_remap_extent(dest, &imap, destoff,
- new_isize);
+ /* Remap into the destination file at the given offset. */
+ imap.br_startoff = destoff;
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
if (error)
break;
@@ -1163,10 +1407,10 @@ xfs_reflink_remap_blocks(
}
/* Advance drange/srange */
- srcoff += range_len;
- destoff += range_len;
- len -= range_len;
- remapped_len += range_len;
+ srcoff += imap.br_blockcount;
+ destoff += imap.br_blockcount;
+ len -= imap.br_blockcount;
+ remapped_len += imap.br_blockcount;
}
if (error)
@@ -1177,81 +1421,6 @@ xfs_reflink_remap_blocks(
}
/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding. The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
- struct inode *src,
- struct inode *dest)
-{
- int error;
-
- if (src > dest)
- swap(src, dest);
-
-retry:
- /* Wait to break both inodes' layouts before we start locking. */
- error = break_layout(src, true);
- if (error)
- return error;
- if (src != dest) {
- error = break_layout(dest, true);
- if (error)
- return error;
- }
-
- /* Lock one inode and make sure nobody got in and leased it. */
- inode_lock(src);
- error = break_layout(src, false);
- if (error) {
- inode_unlock(src);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- if (src == dest)
- return 0;
-
- /* Lock the other inode and make sure nobody got in and leased it. */
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
- error = break_layout(dest, false);
- if (error) {
- inode_unlock(src);
- inode_unlock(dest);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
- struct file *file_in,
- struct file *file_out)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
-
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock(inode_in);
-}
-
-/*
* If we're reflinking to a point past the destination file's EOF, we must
* zero any speculative post-EOF preallocations that sit between the old EOF
* and the destination file offset.
@@ -1267,8 +1436,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_buffered_write_iomap_ops);
+ return xfs_zero_range(ip, isize, pos - isize, NULL);
}
/*
@@ -1313,18 +1481,12 @@ xfs_reflink_remap_prep(
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
- ssize_t ret;
+ int ret;
/* Lock both files against IO */
- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+ ret = xfs_ilock2_io_mmap(src, dest);
if (ret)
return ret;
- if (same_inode)
- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
- XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
@@ -1332,13 +1494,17 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
- if (ret < 0 || *len == 0)
+ if (!IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, &xfs_read_iomap_ops);
+ if (ret || *len == 0)
goto out_unlock;
/* Attach dquots to dest inode before changing block map */
@@ -1373,9 +1539,9 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
- return 1;
+ return 0;
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
return ret;
}
@@ -1389,36 +1555,37 @@ xfs_reflink_inode_has_shared_extents(
struct xfs_bmbt_irec got;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
struct xfs_iext_cursor icur;
bool found;
int error;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
- if (error)
- return error;
- }
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
*has_shared = false;
found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
+ struct xfs_perag *pag;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
goto next;
- agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
aglen = got.br_blockcount;
-
- error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
+ error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
&rbno, &rlen, false);
+ xfs_perag_put(pag);
if (error)
return error;
+
/* Is there still a shared block here? */
if (rbno != NULLAGBLOCK) {
*has_shared = true;
@@ -1462,7 +1629,7 @@ xfs_reflink_clear_inode_flag(
/* Clear the inode flag. */
trace_xfs_reflink_unset_inode_flag(ip);
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
@@ -1530,7 +1697,9 @@ xfs_reflink_unshare(
&xfs_buffered_write_iomap_ops);
if (error)
goto out;
- error = filemap_write_and_wait(inode->i_mapping);
+
+ error = filemap_write_and_wait_range(inode->i_mapping, offset,
+ offset + len - 1);
if (error)
goto out;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 3e4fd46373ab..65c5dfe17ecf 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -8,8 +8,7 @@
static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
{
- return ip->i_mount->m_always_cow &&
- xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+ return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
}
static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
@@ -17,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
-extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
- xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
@@ -56,7 +52,5 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
loff_t *remapped);
extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
xfs_extlen_t cowextsize, unsigned int remap_flags);
-extern void xfs_reflink_remap_unlock(struct file *file_in,
- struct file *file_out);
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 4911b68f95dd..534504ede1a3 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -18,23 +18,28 @@
#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
-kmem_zone_t *xfs_rui_zone;
-kmem_zone_t *xfs_rud_zone;
+struct kmem_cache *xfs_rui_cache;
+struct kmem_cache *xfs_rud_cache;
+
+static const struct xfs_item_ops xfs_rui_item_ops;
static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_rui_log_item, rui_item);
}
-void
+STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
- kmem_cache_free(xfs_rui_zone, ruip);
+ kmem_cache_free(xfs_rui_cache, ruip);
}
/*
@@ -44,15 +49,16 @@ xfs_rui_item_free(
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the RUI.
*/
-void
+STATIC void
xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
ASSERT(atomic_read(&ruip->rui_refcount) > 0);
- if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_rui_item_free(ruip);
- }
+ if (!atomic_dec_and_test(&ruip->rui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&ruip->rui_item, 0);
+ xfs_rui_item_free(ruip);
}
STATIC void
@@ -122,17 +128,10 @@ xfs_rui_item_release(
xfs_rui_release(RUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_rui_item_ops = {
- .iop_size = xfs_rui_item_size,
- .iop_format = xfs_rui_item_format,
- .iop_unpin = xfs_rui_item_unpin,
- .iop_release = xfs_rui_item_release,
-};
-
/*
* Allocate and initialize an rui item with the given number of extents.
*/
-struct xfs_rui_log_item *
+STATIC struct xfs_rui_log_item *
xfs_rui_init(
struct xfs_mount *mp,
uint nextents)
@@ -144,7 +143,8 @@ xfs_rui_init(
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
+ ruip = kmem_cache_zalloc(xfs_rui_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
@@ -155,31 +155,6 @@ xfs_rui_init(
return ruip;
}
-/*
- * Copy an RUI format buffer from the given buf, and into the destination
- * RUI format structure. The RUI/RUD items were designed not to need any
- * special alignment handling.
- */
-int
-xfs_rui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_rui_log_format *dst_rui_fmt)
-{
- struct xfs_rui_log_format *src_rui_fmt;
- uint len;
-
- src_rui_fmt = buf->i_addr;
- len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
-
- if (buf->i_len != len) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
- return -EFSCORRUPTED;
- }
-
- memcpy(dst_rui_fmt, src_rui_fmt, len);
- return 0;
-}
-
static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_rud_log_item, rud_item);
@@ -229,14 +204,24 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_cache_free(xfs_rud_zone, rudp);
+ kmem_free(rudp->rud_item.li_lv_shadow);
+ kmem_cache_free(xfs_rud_cache, rudp);
+}
+
+static struct xfs_log_item *
+xfs_rud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &RUD_ITEM(lip)->rud_ruip->rui_item;
}
static const struct xfs_item_ops xfs_rud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_rud_item_size,
.iop_format = xfs_rud_item_format,
.iop_release = xfs_rud_item_release,
+ .iop_intent = xfs_rud_item_intent,
};
static struct xfs_rud_log_item *
@@ -246,7 +231,7 @@ xfs_trans_get_rud(
{
struct xfs_rud_log_item *rudp;
- rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
&xfs_rud_item_ops);
rudp->rud_ruip = ruip;
@@ -329,7 +314,7 @@ xfs_trans_log_finish_rmap_update(
* 1.) releases the RUI and frees the RUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
@@ -339,8 +324,8 @@ xfs_trans_log_finish_rmap_update(
static int
xfs_rmap_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_rmap_intent *ra;
@@ -352,41 +337,16 @@ xfs_rmap_update_diff_items(
XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
}
-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_rui_log_item *ruip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- ruip = xfs_rui_init(tp->t_mountp, count);
- ASSERT(ruip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &ruip->rui_item);
- return ruip;
-}
-
/* Log rmap updates in the intent item. */
STATIC void
xfs_rmap_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_rui_log_item *ruip,
+ struct xfs_rmap_intent *rmap)
{
- struct xfs_rui_log_item *ruip = intent;
- struct xfs_rmap_intent *rmap;
uint next_extent;
struct xfs_map_extent *map;
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
@@ -406,58 +366,64 @@ xfs_rmap_update_log_item(
rmap->ri_bmap.br_state);
}
+static struct xfs_log_item *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
+ struct xfs_rmap_intent *rmap;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ if (sort)
+ list_sort(mp, items, xfs_rmap_update_diff_items);
+ list_for_each_entry(rmap, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, rmap);
+ return &ruip->rui_item;
+}
+
/* Get an RUD so we can process all the deferred rmap updates. */
-STATIC void *
+static struct xfs_log_item *
xfs_rmap_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_rud(tp, intent);
+ return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
}
/* Process a deferred rmap update. */
STATIC int
xfs_rmap_update_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_rmap_intent *rmap;
int error;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, done_item,
- rmap->ri_type,
- rmap->ri_owner, rmap->ri_whichfork,
- rmap->ri_bmap.br_startoff,
- rmap->ri_bmap.br_startblock,
- rmap->ri_bmap.br_blockcount,
- rmap->ri_bmap.br_state,
- (struct xfs_btree_cur **)state);
- kmem_free(rmap);
+ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done),
+ rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork,
+ rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
+ rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
+ state);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
return error;
}
-/* Clean up after processing deferred rmaps. */
-STATIC void
-xfs_rmap_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
-}
-
/* Abort all pending RUIs. */
STATIC void
xfs_rmap_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_rui_release(intent);
+ xfs_rui_release(RUI_ITEM(intent));
}
/* Cancel a deferred rmap update. */
@@ -468,43 +434,75 @@ xfs_rmap_update_cancel_item(
struct xfs_rmap_intent *rmap;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_free(rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, rmap);
}
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_rmap_update_diff_items,
.create_intent = xfs_rmap_update_create_intent,
.abort_intent = xfs_rmap_update_abort_intent,
- .log_item = xfs_rmap_update_log_item,
.create_done = xfs_rmap_update_create_done,
.finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_update_finish_cleanup,
+ .finish_cleanup = xfs_rmap_finish_one_cleanup,
.cancel_item = xfs_rmap_update_cancel_item,
};
+/* Is this recovered RUI ok? */
+static inline bool
+xfs_rui_validate_map(
+ struct xfs_mount *mp,
+ struct xfs_map_extent *rmap)
+{
+ if (!xfs_has_rmapbt(mp))
+ return false;
+
+ if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)
+ return false;
+
+ switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ case XFS_RMAP_EXTENT_UNMAP:
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ case XFS_RMAP_EXTENT_CONVERT:
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ case XFS_RMAP_EXTENT_ALLOC:
+ case XFS_RMAP_EXTENT_FREE:
+ break;
+ default:
+ return false;
+ }
+
+ if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) &&
+ !xfs_verify_ino(mp, rmap->me_owner))
+ return false;
+
+ if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len))
+ return false;
+
+ return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len);
+}
+
/*
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
*/
-int
-xfs_rui_recover(
- struct xfs_mount *mp,
- struct xfs_rui_log_item *ruip)
+STATIC int
+xfs_rui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- int i;
- int error = 0;
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_map_extent *rmap;
- xfs_fsblock_t startblock_fsb;
- bool op_ok;
struct xfs_rud_log_item *rudp;
- enum xfs_rmap_intent_type type;
- int whichfork;
- xfs_exntst_t state;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
-
- ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags));
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ enum xfs_rmap_intent_type type;
+ xfs_exntst_t state;
+ int i;
+ int whichfork;
+ int error = 0;
/*
* First check the validity of the extents described by the
@@ -512,35 +510,11 @@ xfs_rui_recover(
* just toss the RUI.
*/
for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
- rmap = &ruip->rui_format.rui_extents[i];
- startblock_fsb = XFS_BB_TO_FSB(mp,
- XFS_FSB_TO_DADDR(mp, rmap->me_startblock));
- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
- case XFS_RMAP_EXTENT_MAP:
- case XFS_RMAP_EXTENT_MAP_SHARED:
- case XFS_RMAP_EXTENT_UNMAP:
- case XFS_RMAP_EXTENT_UNMAP_SHARED:
- case XFS_RMAP_EXTENT_CONVERT:
- case XFS_RMAP_EXTENT_CONVERT_SHARED:
- case XFS_RMAP_EXTENT_ALLOC:
- case XFS_RMAP_EXTENT_FREE:
- op_ok = true;
- break;
- default:
- op_ok = false;
- break;
- }
- if (!op_ok || startblock_fsb == 0 ||
- rmap->me_len == 0 ||
- startblock_fsb >= mp->m_sb.sb_dblocks ||
- rmap->me_len >= mp->m_sb.sb_agblocks ||
- (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the RUI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
- xfs_rui_release(ruip);
+ if (!xfs_rui_validate_map(mp,
+ &ruip->rui_format.rui_extents[i])) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
return -EFSCORRUPTED;
}
}
@@ -583,7 +557,9 @@ xfs_rui_recover(
type = XFS_RMAP_FREE;
break;
default:
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -591,18 +567,161 @@ xfs_rui_recover(
rmap->me_owner, whichfork,
rmap->me_startoff, rmap->me_startblock,
rmap->me_len, state, &rcur);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ rmap, sizeof(*rmap));
if (error)
goto abort_error;
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
+
+STATIC bool
+xfs_rui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_rui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_rud_log_item *rudp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = RUI_ITEM(intent)->rui_format.rui_nextents;
+ extp = RUI_ITEM(intent)->rui_format.rui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ atomic_set(&ruip->rui_next_extent, count);
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+ return &ruip->rui_item;
+}
+
+static const struct xfs_item_ops xfs_rui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_rui_item_size,
+ .iop_format = xfs_rui_item_format,
+ .iop_unpin = xfs_rui_item_unpin,
+ .iop_release = xfs_rui_item_release,
+ .iop_recover = xfs_rui_item_recover,
+ .iop_match = xfs_rui_item_match,
+ .iop_relog = xfs_rui_item_relog,
+};
+
+static inline void
+xfs_rui_copy_format(
+ struct xfs_rui_log_format *dst,
+ const struct xfs_rui_log_format *src)
+{
+ unsigned int i;
+
+ memcpy(dst, src, offsetof(struct xfs_rui_log_format, rui_extents));
+
+ for (i = 0; i < src->rui_nextents; i++)
+ memcpy(&dst->rui_extents[i], &src->rui_extents[i],
+ sizeof(struct xfs_map_extent));
+}
+
+/*
+ * This routine is called to create an in-core extent rmap update
+ * item from the rui format structure which was logged on disk.
+ * It allocates an in-core rui, copies the extents from the format
+ * structure into it, and adds the rui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_rui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_rui_log_format *rui_formatp;
+ size_t len;
+
+ rui_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
+ if (item->ri_buf[0].i_len != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
+ atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn);
+ xfs_rui_release(ruip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_rui_item_ops = {
+ .item_type = XFS_LI_RUI,
+ .commit_pass2 = xlog_recover_rui_commit_pass2,
+};
+
+/*
+ * This routine is called when an RUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding RUI if it
+ * was still in the log. To do this it searches the AIL for the RUI with an id
+ * equal to that in the RUD format structure. If we find it we drop the RUD
+ * reference, which removes the RUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_rud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_rud_log_format *rud_formatp;
+
+ rud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ rud_formatp, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_rud_item_ops = {
+ .item_type = XFS_LI_RUD,
+ .commit_pass2 = xlog_recover_rud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 8708e4a5aa5c..802e5119eaca 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -28,7 +28,7 @@
/* kernel only RUI/RUD definitions */
struct xfs_mount;
-struct kmem_zone;
+struct kmem_cache;
/*
* Max number of extents in fast allocation path.
@@ -36,11 +36,6 @@ struct kmem_zone;
#define XFS_RUI_MAX_FAST_EXTENTS 16
/*
- * Define RUI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_RUI_RECOVERED 1
-
-/*
* This is the "rmap update intent" log item. It is used to log the fact that
* some reverse mappings need to change. It is used in conjunction with the
* "rmap update done" log item described below.
@@ -52,7 +47,6 @@ struct xfs_rui_log_item {
struct xfs_log_item rui_item;
atomic_t rui_refcount;
atomic_t rui_next_extent;
- unsigned long rui_flags; /* misc flags */
struct xfs_rui_log_format rui_format;
};
@@ -74,14 +68,7 @@ struct xfs_rud_log_item {
struct xfs_rud_log_format rud_format;
};
-extern struct kmem_zone *xfs_rui_zone;
-extern struct kmem_zone *xfs_rud_zone;
-
-struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
-int xfs_rui_copy_format(struct xfs_log_iovec *buf,
- struct xfs_rui_log_format *dst_rui_fmt);
-void xfs_rui_item_free(struct xfs_rui_log_item *);
-void xfs_rui_release(struct xfs_rui_log_item *);
-int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip);
+extern struct kmem_cache *xfs_rui_cache;
+extern struct kmem_cache *xfs_rud_cache;
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6209e7b6b895..292d5e54a92c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -18,7 +18,7 @@
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_sb.h"
/*
* Read and return the summary information for a given extent size,
@@ -32,7 +32,7 @@ xfs_rtget_summary(
xfs_trans_t *tp, /* transaction pointer */
int log, /* log2 of extent size */
xfs_rtblock_t bbno, /* bitmap block number */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_suminfo_t *sum) /* out: summary info for this block */
{
@@ -50,7 +50,7 @@ xfs_rtany_summary(
int low, /* low log2 extent size */
int high, /* high log2 extent size */
xfs_rtblock_t bbno, /* bitmap block number */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
int *stat) /* out: any good extents here? */
{
@@ -104,7 +104,7 @@ xfs_rtcopy_summary(
xfs_trans_t *tp) /* transaction pointer */
{
xfs_rtblock_t bbno; /* bitmap block number */
- xfs_buf_t *bp; /* summary buffer */
+ struct xfs_buf *bp; /* summary buffer */
int error; /* error return value */
int log; /* summary level number (log length) */
xfs_suminfo_t sum; /* summary data */
@@ -144,7 +144,7 @@ xfs_rtallocate_range(
xfs_trans_t *tp, /* transaction pointer */
xfs_rtblock_t start, /* start block to allocate */
xfs_extlen_t len, /* length to allocate */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb) /* in/out: summary block number */
{
xfs_rtblock_t end; /* end of the allocated extent */
@@ -226,7 +226,7 @@ xfs_rtallocate_extent_block(
xfs_extlen_t maxlen, /* maximum length to allocate */
xfs_extlen_t *len, /* out: actual length allocated */
xfs_rtblock_t *nextp, /* out: next block to try */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_extlen_t prod, /* extent product factor */
xfs_rtblock_t *rtblock) /* out: start block allocated */
@@ -247,6 +247,9 @@ xfs_rtallocate_extent_block(
end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
i <= end;
i++) {
+ /* Make sure we don't scan off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
@@ -342,7 +345,7 @@ xfs_rtallocate_extent_exact(
xfs_extlen_t minlen, /* minimum length to allocate */
xfs_extlen_t maxlen, /* maximum length to allocate */
xfs_extlen_t *len, /* out: actual length allocated */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_extlen_t prod, /* extent product factor */
xfs_rtblock_t *rtblock) /* out: start block allocated */
@@ -421,7 +424,7 @@ xfs_rtallocate_extent_near(
xfs_extlen_t minlen, /* minimum length to allocate */
xfs_extlen_t maxlen, /* maximum length to allocate */
xfs_extlen_t *len, /* out: actual length allocated */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_extlen_t prod, /* extent product factor */
xfs_rtblock_t *rtblock) /* out: start block allocated */
@@ -442,6 +445,14 @@ xfs_rtallocate_extent_near(
*/
if (bno >= mp->m_sb.sb_rextents)
bno = mp->m_sb.sb_rextents - 1;
+
+ /* Make sure we don't run off the end of the rt volume. */
+ maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ if (maxlen < minlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+
/*
* Try the exact allocation first.
*/
@@ -615,7 +626,7 @@ xfs_rtallocate_extent_size(
xfs_extlen_t minlen, /* minimum length to allocate */
xfs_extlen_t maxlen, /* maximum length to allocate */
xfs_extlen_t *len, /* out: actual length allocated */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ struct xfs_buf **rbpp, /* in/out: summary block buffer */
xfs_fsblock_t *rsb, /* in/out: summary block number */
xfs_extlen_t prod, /* extent product factor */
xfs_rtblock_t *rtblock) /* out: start block allocated */
@@ -767,8 +778,14 @@ xfs_growfs_rt_alloc(
struct xfs_bmbt_irec map; /* block map output */
int nmap; /* number of block maps */
int resblks; /* space reservation */
+ enum xfs_blft buf_type;
struct xfs_trans *tp;
+ if (ip == mp->m_rsumip)
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ else
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+
/*
* Allocate space to the file, as necessary.
*/
@@ -787,6 +804,14 @@ xfs_growfs_rt_alloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
/*
* Allocate blocks to the bitmap file.
*/
@@ -830,6 +855,9 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0, &bp);
if (error)
goto out_trans_cancel;
+
+ xfs_trans_buf_set_type(tp, bp, buf_type);
+ bp->b_ops = &xfs_rtbuf_ops;
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
@@ -862,7 +890,7 @@ xfs_alloc_rsum_cache(
* lower bound on the minimum level with any free extents. We can
* continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
+ mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
if (!mp->m_rsum_cache)
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -880,7 +908,7 @@ xfs_growfs_rt(
xfs_growfs_rt_t *in) /* growfs rt input struct */
{
xfs_rtblock_t bmbno; /* bitmap block number */
- xfs_buf_t *bp; /* temporary buffer */
+ struct xfs_buf *bp; /* temporary buffer */
int error; /* error return value */
xfs_mount_t *nmp; /* new (fake) mount structure */
xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
@@ -898,16 +926,40 @@ xfs_growfs_rt(
uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
- /*
- * Initial error checking.
- */
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
- (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
- (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
+
+ /* Needs to have been mounted with an rt device. */
+ if (!XFS_IS_REALTIME_MOUNT(mp))
+ return -EINVAL;
+ /*
+ * Mount should fail if the rt bitmap/summary files don't load, but
+ * we'll check anyway.
+ */
+ if (!mp->m_rbmip || !mp->m_rsumip)
+ return -EINVAL;
+
+ /* Shrink not supported. */
+ if (in->newblocks <= sbp->sb_rblocks)
+ return -EINVAL;
+
+ /* Can only change rt extent size when adding rt volume. */
+ if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize)
+ return -EINVAL;
+
+ /* Range check the extent size. */
+ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE ||
+ XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE)
return -EINVAL;
- if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
+
+ /* Unsupported realtime features. */
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ return -EOPNOTSUPP;
+
+ nrblocks = in->newblocks;
+ error = xfs_sb_validate_fsb_count(sbp, nrblocks);
+ if (error)
return error;
/*
* Read in the last block of the device, make sure it exists.
@@ -941,8 +993,8 @@ xfs_growfs_rt(
* Get the old block counts for bitmap and summary inodes.
* These can't change since other growfs callers are locked out.
*/
- rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
- rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
+ rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_disk_size);
+ rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_disk_size);
/*
* Allocate space to the bitmap and summary files, as necessary.
*/
@@ -971,7 +1023,8 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
- xfs_trans_t *tp;
+ struct xfs_trans *tp;
+ xfs_rfsblock_t nrblocks_step;
*nmp = *mp;
nsbp = &nmp->m_sb;
@@ -980,10 +1033,9 @@ xfs_growfs_rt(
*/
nsbp->sb_rextsize = in->extsize;
nsbp->sb_rbmblocks = bmbno + 1;
- nsbp->sb_rblocks =
- XFS_RTMIN(nrblocks,
- nsbp->sb_rbmblocks * NBBY *
- nsbp->sb_blocksize * nsbp->sb_rextsize);
+ nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
+ nsbp->sb_rextsize;
+ nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
nsbp->sb_rextents = nsbp->sb_rblocks;
do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
ASSERT(nsbp->sb_rextents != 0);
@@ -1004,23 +1056,29 @@ xfs_growfs_rt(
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
/*
- * Update the bitmap inode's size.
+ * Update the bitmap inode's size ondisk and incore. We need
+ * to update the incore size so that inode inactivation won't
+ * punch what it thinks are "posteof" blocks.
*/
- mp->m_rbmip->i_d.di_size =
+ mp->m_rbmip->i_disk_size =
nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+ i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
/*
* Get the summary inode into the transaction.
*/
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
- * Update the summary inode's size.
+ * Update the summary inode's size. We need to update the
+ * incore size so that inode inactivation won't punch what it
+ * thinks are "posteof" blocks.
*/
- mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+ mp->m_rsumip->i_disk_size = nmp->m_rsumsize;
+ i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_disk_size);
xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
/*
* Copy summary data from old to new sizes.
@@ -1075,8 +1133,17 @@ error_cancel:
error = xfs_trans_commit(tp);
if (error)
break;
+
+ /* Ensure the mount RT feature flag is now set. */
+ mp->m_features |= XFS_FEAT_REALTIME;
}
+ if (error)
+ goto out_free;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+out_free:
/*
* Free the fake mp structure.
*/
@@ -1119,7 +1186,7 @@ xfs_rtallocate_extent(
int error; /* error value */
xfs_rtblock_t r; /* result allocated block */
xfs_fsblock_t sb; /* summary file block number */
- xfs_buf_t *sumbp; /* summary file block buffer */
+ struct xfs_buf *sumbp; /* summary file block buffer */
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
ASSERT(minlen > 0 && minlen <= maxlen);
@@ -1220,6 +1287,44 @@ xfs_rtmount_init(
return 0;
}
+static int
+xfs_rtalloc_count_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ uint64_t *valp = priv;
+
+ *valp += rec->ar_extcount;
+ return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+ struct xfs_mount *mp)
+{
+ uint64_t val = 0;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+ &val);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_frextents = val;
+ spin_unlock(&mp->m_sb_lock);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ return 0;
+}
+
/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
@@ -1281,8 +1386,8 @@ xfs_rtpick_extent(
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
- if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
- mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+ if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
+ mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
*seqp = 0;
}
seq = *seqp;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 93e77b221355..62c7ad79cbb6 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,9 +22,10 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
- void *priv);
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
#ifdef CONFIG_XFS_RT
/*
@@ -115,36 +116,37 @@ int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len, int val);
int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
int log, xfs_rtblock_t bbno, int delta,
- xfs_buf_t **rbpp, xfs_fsblock_t *rsb,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
xfs_suminfo_t *sum);
int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
- xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
+ xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
xfs_fsblock_t *rsb);
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_trans *tp,
- struct xfs_rtalloc_rec *low_rec,
- struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-int xfs_rtalloc_query_all(struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
# define xfs_verify_rtbno(m, r) (false)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 113883c4f202..90a77cd3ebad 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
uint64_t xs_xstrat_bytes = 0;
uint64_t xs_write_bytes = 0;
uint64_t xs_read_bytes = 0;
+ uint64_t defer_relog = 0;
static const struct xstats_entry {
char *desc;
@@ -57,24 +58,27 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
/* Loop over all stats groups */
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- len += snprintf(buf + len, PATH_MAX - len, "%s",
+ len += scnprintf(buf + len, PATH_MAX - len, "%s",
xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- len += snprintf(buf + len, PATH_MAX - len, " %u",
+ len += scnprintf(buf + len, PATH_MAX - len, " %u",
counter_val(stats, j));
- len += snprintf(buf + len, PATH_MAX - len, "\n");
+ len += scnprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
+ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
- len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %llu %llu %llu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
+ defer_relog);
+ len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
@@ -121,7 +125,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
{
int j;
- seq_printf(m, "qm");
+ seq_puts(m, "qm");
for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 34d704f703d2..43ffba74f045 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -137,6 +137,7 @@ struct __xfsstats {
uint64_t xs_xstrat_bytes;
uint64_t xs_write_bytes;
uint64_t xs_read_bytes;
+ uint64_t defer_relog;
};
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2094386af8ac..ee4b429a2f2c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -35,6 +35,12 @@
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
+#include "xfs_pwork.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+#include "xfs_iunlink_item.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -47,6 +53,61 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+static LIST_HEAD(xfs_mount_list);
+static DEFINE_SPINLOCK(xfs_mount_list_lock);
+
+static inline void xfs_mount_list_add(struct xfs_mount *mp)
+{
+ spin_lock(&xfs_mount_list_lock);
+ list_add(&mp->m_mount_list, &xfs_mount_list);
+ spin_unlock(&xfs_mount_list_lock);
+}
+
+static inline void xfs_mount_list_del(struct xfs_mount *mp)
+{
+ spin_lock(&xfs_mount_list_lock);
+ list_del(&mp->m_mount_list);
+ spin_unlock(&xfs_mount_list_lock);
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
+static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
+#endif
+
+enum xfs_dax_mode {
+ XFS_DAX_INODE = 0,
+ XFS_DAX_ALWAYS = 1,
+ XFS_DAX_NEVER = 2,
+};
+
+static void
+xfs_mount_set_dax_mode(
+ struct xfs_mount *mp,
+ enum xfs_dax_mode mode)
+{
+ switch (mode) {
+ case XFS_DAX_INODE:
+ mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER);
+ break;
+ case XFS_DAX_ALWAYS:
+ mp->m_features |= XFS_FEAT_DAX_ALWAYS;
+ mp->m_features &= ~XFS_FEAT_DAX_NEVER;
+ break;
+ case XFS_DAX_NEVER:
+ mp->m_features |= XFS_FEAT_DAX_NEVER;
+ mp->m_features &= ~XFS_FEAT_DAX_ALWAYS;
+ break;
+ }
+}
+
+static const struct constant_table dax_param_enums[] = {
+ {"inode", XFS_DAX_INODE },
+ {"always", XFS_DAX_ALWAYS },
+ {"never", XFS_DAX_NEVER },
+ {}
+};
+
/*
* Table driven mount option parser.
*/
@@ -59,7 +120,7 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -103,6 +164,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("discard", Opt_discard),
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
{}
};
@@ -118,32 +180,32 @@ xfs_fs_show_options(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_IKEEP, ",ikeep" },
- { XFS_MOUNT_WSYNC, ",wsync" },
- { XFS_MOUNT_NOALIGN, ",noalign" },
- { XFS_MOUNT_SWALLOC, ",swalloc" },
- { XFS_MOUNT_NOUUID, ",nouuid" },
- { XFS_MOUNT_NORECOVERY, ",norecovery" },
- { XFS_MOUNT_ATTR2, ",attr2" },
- { XFS_MOUNT_FILESTREAMS, ",filestreams" },
- { XFS_MOUNT_GRPID, ",grpid" },
- { XFS_MOUNT_DISCARD, ",discard" },
- { XFS_MOUNT_LARGEIO, ",largeio" },
- { XFS_MOUNT_DAX, ",dax" },
+ { XFS_FEAT_IKEEP, ",ikeep" },
+ { XFS_FEAT_WSYNC, ",wsync" },
+ { XFS_FEAT_NOALIGN, ",noalign" },
+ { XFS_FEAT_SWALLOC, ",swalloc" },
+ { XFS_FEAT_NOUUID, ",nouuid" },
+ { XFS_FEAT_NORECOVERY, ",norecovery" },
+ { XFS_FEAT_ATTR2, ",attr2" },
+ { XFS_FEAT_FILESTREAMS, ",filestreams" },
+ { XFS_FEAT_GRPID, ",grpid" },
+ { XFS_FEAT_DISCARD, ",discard" },
+ { XFS_FEAT_LARGE_IOSIZE, ",largeio" },
+ { XFS_FEAT_DAX_ALWAYS, ",dax=always" },
+ { XFS_FEAT_DAX_NEVER, ",dax=never" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
struct proc_xfs_info *xfs_infop;
for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
- if (mp->m_flags & xfs_infop->flag)
+ if (mp->m_features & xfs_infop->flag)
seq_puts(m, xfs_infop->str);
}
- seq_printf(m, ",inode%d",
- (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64);
+ seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 32 : 64);
- if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ if (xfs_has_allocsize(mp))
seq_printf(m, ",allocsize=%dk",
(1 << mp->m_allocsize_log) >> 10);
@@ -164,23 +226,20 @@ xfs_fs_show_options(
seq_printf(m, ",swidth=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
- if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
+ if (mp->m_qflags & XFS_UQUOTA_ENFD)
seq_puts(m, ",usrquota");
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
seq_puts(m, ",uqnoenforce");
- if (mp->m_qflags & XFS_PQUOTA_ACCT) {
- if (mp->m_qflags & XFS_PQUOTA_ENFD)
- seq_puts(m, ",prjquota");
- else
- seq_puts(m, ",pqnoenforce");
- }
- if (mp->m_qflags & XFS_GQUOTA_ACCT) {
- if (mp->m_qflags & XFS_GQUOTA_ENFD)
- seq_puts(m, ",grpquota");
- else
- seq_puts(m, ",gqnoenforce");
- }
+ if (mp->m_qflags & XFS_PQUOTA_ENFD)
+ seq_puts(m, ",prjquota");
+ else if (mp->m_qflags & XFS_PQUOTA_ACCT)
+ seq_puts(m, ",pqnoenforce");
+
+ if (mp->m_qflags & XFS_GQUOTA_ENFD)
+ seq_puts(m, ",grpquota");
+ else if (mp->m_qflags & XFS_GQUOTA_ACCT)
+ seq_puts(m, ",gqnoenforce");
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
@@ -191,11 +250,11 @@ xfs_fs_show_options(
/*
* Set parameters for inode allocation heuristics, taking into account
* filesystem size and inode32/inode64 mount options; i.e. specifically
- * whether or not XFS_MOUNT_SMALL_INUMS is set.
+ * whether or not XFS_FEAT_SMALL_INUMS is set.
*
* Inode allocation patterns are altered only if inode32 is requested
- * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
- * If altered, XFS_MOUNT_32BITINODES is set as well.
+ * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large.
+ * If altered, XFS_OPSTATE_INODE32 is set as well.
*
* An agcount independent of that in the mount structure is provided
* because in the growfs case, mp->m_sb.sb_agcount is not yet updated
@@ -237,13 +296,13 @@ xfs_set_inode_alloc(
/*
* If user asked for no more than 32-bit inodes, and the fs is
- * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
+ * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter
* the allocator to accommodate the request.
*/
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
- mp->m_flags |= XFS_MOUNT_32BITINODES;
+ if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32)
+ set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
else
- mp->m_flags &= ~XFS_MOUNT_32BITINODES;
+ clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
for (index = 0; index < agcount; index++) {
struct xfs_perag *pag;
@@ -252,7 +311,7 @@ xfs_set_inode_alloc(
pag = xfs_perag_get(mp, index);
- if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ if (xfs_is_inode32(mp)) {
if (ino > XFS_MAXINUMBER_32) {
pag->pagi_inodeok = 0;
pag->pagf_metadata = 0;
@@ -272,7 +331,39 @@ xfs_set_inode_alloc(
xfs_perag_put(pag);
}
- return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
+ return xfs_is_inode32(mp) ? maxagi : agcount;
+}
+
+static int
+xfs_setup_dax_always(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_ddev_targp->bt_daxdev &&
+ (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
+ xfs_alert(mp,
+ "DAX unsupported by block device. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (mp->m_super->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "DAX not supported for blocksize. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (xfs_has_reflink(mp) &&
+ bdev_is_partition(mp->m_ddev_targp->bt_bdev)) {
+ xfs_alert(mp,
+ "DAX and reflink cannot work with multi-partitions!");
+ return -EINVAL;
+ }
+
+ xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ return 0;
+
+disable_dax:
+ xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
+ return 0;
}
STATIC int
@@ -301,37 +392,23 @@ xfs_blkdev_put(
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
-void
-xfs_blkdev_issue_flush(
- xfs_buftarg_t *buftarg)
-{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL);
-}
-
STATIC void
xfs_close_devices(
struct xfs_mount *mp)
{
- struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
-
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
- struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
xfs_free_buftarg(mp->m_logdev_targp);
xfs_blkdev_put(logdev);
- fs_put_dax(dax_logdev);
}
if (mp->m_rtdev_targp) {
struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
- struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
xfs_free_buftarg(mp->m_rtdev_targp);
xfs_blkdev_put(rtdev);
- fs_put_dax(dax_rtdev);
}
xfs_free_buftarg(mp->m_ddev_targp);
- fs_put_dax(dax_ddev);
}
/*
@@ -349,8 +426,6 @@ xfs_open_devices(
struct xfs_mount *mp)
{
struct block_device *ddev = mp->m_super->s_bdev;
- struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev);
- struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
@@ -360,8 +435,7 @@ xfs_open_devices(
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
- goto out;
- dax_logdev = fs_dax_get_by_bdev(logdev);
+ return error;
}
if (mp->m_rtname) {
@@ -375,25 +449,24 @@ xfs_open_devices(
error = -EINVAL;
goto out_close_rtdev;
}
- dax_rtdev = fs_dax_get_by_bdev(rtdev);
}
/*
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -409,14 +482,9 @@ xfs_open_devices(
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
xfs_blkdev_put(rtdev);
- fs_put_dax(dax_rtdev);
out_close_logdev:
- if (logdev && logdev != ddev) {
+ if (logdev && logdev != ddev)
xfs_blkdev_put(logdev);
- fs_put_dax(dax_logdev);
- }
- out:
- fs_put_dax(dax_ddev);
return error;
}
@@ -436,7 +504,7 @@ xfs_setup_devices(
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
unsigned int log_sector_size = BBSIZE;
- if (xfs_sb_version_hassector(&mp->m_sb))
+ if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
log_sector_size);
@@ -458,44 +526,48 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
- mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
- 0, mp->m_super->s_id);
- if (!mp->m_cil_workqueue)
- goto out_destroy_unwritten;
-
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
- goto out_destroy_cil;
+ goto out_destroy_unwritten;
- mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
- if (!mp->m_eofblocks_workqueue)
+ mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s",
+ XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
+ if (!mp->m_blockgc_wq)
goto out_destroy_reclaim;
- mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- mp->m_super->s_id);
+ mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
+ if (!mp->m_inodegc_wq)
+ goto out_destroy_blockgc;
+
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
if (!mp->m_sync_workqueue)
- goto out_destroy_eofb;
+ goto out_destroy_inodegc;
return 0;
-out_destroy_eofb:
- destroy_workqueue(mp->m_eofblocks_workqueue);
+out_destroy_inodegc:
+ destroy_workqueue(mp->m_inodegc_wq);
+out_destroy_blockgc:
+ destroy_workqueue(mp->m_blockgc_wq);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
-out_destroy_cil:
- destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_buf:
@@ -509,13 +581,27 @@ xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
destroy_workqueue(mp->m_sync_workqueue);
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_wq);
+ destroy_workqueue(mp->m_inodegc_wq);
destroy_workqueue(mp->m_reclaim_workqueue);
- destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
destroy_workqueue(mp->m_buf_workqueue);
}
+static void
+xfs_flush_inodes_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work, struct xfs_mount,
+ m_flush_inodes_work);
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
/*
* Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
* or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
@@ -526,12 +612,15 @@ void
xfs_flush_inodes(
struct xfs_mount *mp)
{
- struct super_block *sb = mp->m_super;
+ /*
+ * If flush_work() returns true then that means we waited for a flush
+ * which was already in progress. Don't bother running another scan.
+ */
+ if (flush_work(&mp->m_flush_inodes_work))
+ return;
- if (down_read_trylock(&sb->s_umount)) {
- sync_inodes_sb(sb);
- up_read(&sb->s_umount);
- }
+ queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
+ flush_work(&mp->m_flush_inodes_work);
}
/* Catch misguided souls that try to use this interface on XFS */
@@ -543,32 +632,6 @@ xfs_fs_alloc_inode(
return NULL;
}
-#ifdef DEBUG
-static void
-xfs_check_delalloc(
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
-
- if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
- return;
- do {
- if (isnullstartblock(got.br_startblock)) {
- xfs_warn(ip->i_mount,
- "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
- ip->i_ino,
- whichfork == XFS_DATA_FORK ? "data" : "cow",
- got.br_startoff, got.br_blockcount);
- }
- } while (xfs_iext_next_extent(ifp, &icur, &got));
-}
-#else
-#define xfs_check_delalloc(ip, whichfork) do { } while (0)
-#endif
-
/*
* Now that the generic code is guaranteed not to be accessing
* the linux inode, we can inactivate and reclaim the inode.
@@ -584,37 +647,13 @@ xfs_fs_destroy_inode(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
-
- xfs_inactive(ip);
-
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
- xfs_check_delalloc(ip, XFS_DATA_FORK);
- xfs_check_delalloc(ip, XFS_COW_FORK);
- ASSERT(0);
- }
-
- XFS_STATS_INC(ip->i_mount, vn_reclaim);
-
- /*
- * We should never get here with one of the reclaim flags already set.
- */
- ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
- ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
-
- /*
- * We always use background reclaim here because even if the
- * inode is clean, it still may be under IO and hence we have
- * to take the flush lock. The background reclaim path handles
- * this more efficiently than we can here, so simply let background
- * reclaim tear down all inodes.
- */
- xfs_inode_set_reclaim_tag(ip);
+ xfs_inode_mark_reclaimable(ip);
}
static void
xfs_fs_dirty_inode(
struct inode *inode,
- int flag)
+ int flags)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -622,7 +661,13 @@ xfs_fs_dirty_inode(
if (!(inode->i_sb->s_flags & SB_LAZYTIME))
return;
- if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME))
+
+ /*
+ * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC)
+ * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed
+ * in flags possibly together with I_DIRTY_SYNC.
+ */
+ if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME))
return;
if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp))
@@ -656,8 +701,6 @@ xfs_fs_inode_init_once(
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
}
@@ -681,11 +724,11 @@ xfs_fs_drop_inode(
* that. See the comment for this inode flag.
*/
if (ip->i_flags & XFS_IRECOVERY) {
- ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+ ASSERT(xlog_recovery_needed(ip->i_mount->m_log));
return 0;
}
- return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
+ return generic_drop_inode(inode);
}
static void
@@ -703,6 +746,9 @@ xfs_fs_sync_fs(
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
+ int error;
+
+ trace_xfs_fs_sync_fs(mp, __return_address);
/*
* Doing anything during the async pass would be counterproductive.
@@ -710,7 +756,10 @@ xfs_fs_sync_fs(
if (!wait)
return 0;
- xfs_log_force(mp, XFS_LOG_SYNC);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
@@ -720,6 +769,25 @@ xfs_fs_sync_fs(
flush_delayed_work(&mp->m_log->l_work);
}
+ /*
+ * If we are called with page faults frozen out, it means we are about
+ * to freeze the transaction subsystem. Take the opportunity to shut
+ * down inodegc because once SB_FREEZE_FS is set it's too late to
+ * prevent inactivation races with freeze. The fs doesn't get called
+ * again by the freezing process until after SB_FREEZE_FS has been set,
+ * so it's now or never. Same logic applies to speculative allocation
+ * garbage collection.
+ *
+ * We don't care if this is a normal syncfs call that does this or
+ * freeze that does this - we can run this multiple times without issue
+ * and we won't race with a restart because a restart can only occur
+ * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE.
+ */
+ if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
+ xfs_inodegc_stop(mp);
+ xfs_blockgc_stop(mp);
+ }
+
return 0;
}
@@ -738,12 +806,17 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
+ /*
+ * Expedite background inodegc but don't wait. We do not want to block
+ * here waiting hours for a billion extent file to be truncated.
+ */
+ xfs_inodegc_push(mp);
+
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
- statp->f_fsid.val[0] = (u32)id;
- statp->f_fsid.val[1] = (u32)(id >> 32);
+ statp->f_fsid = u64_to_fsid(id);
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
@@ -755,7 +828,9 @@ xfs_fs_statfs(
statp->f_blocks = sbp->sb_dblocks - lsize;
spin_unlock(&mp->m_sb_lock);
- statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
+ /* make sure statp->f_bfree does not underflow */
+ statp->f_bfree = max_t(int64_t, 0,
+ fdblocks - xfs_fdblocks_unavailable(mp));
statp->f_bavail = statp->f_bfree;
fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
@@ -775,16 +850,18 @@ xfs_fs_statfs(
statp->f_ffree = max_t(int64_t, ffree, 0);
- if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
(XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
xfs_qm_statvfs(ip, statp);
if (XFS_IS_REALTIME_MOUNT(mp) &&
- (ip->i_d.di_flags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ s64 freertx;
+
statp->f_blocks = sbp->sb_rblocks;
- statp->f_bavail = statp->f_bfree =
- sbp->sb_frextents * sbp->sb_rextsize;
+ freertx = percpu_counter_sum_positive(&mp->m_frextents);
+ statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
}
return 0;
@@ -814,48 +891,6 @@ xfs_restore_resvblks(struct xfs_mount *mp)
}
/*
- * Trigger writeback of all the dirty metadata in the file system.
- *
- * This ensures that the metadata is written to their location on disk rather
- * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk - this is the
- * primary difference between a sync and a quiesce.
- *
- * Note: xfs_log_quiesce() stops background log work - the callers must ensure
- * it is started again when appropriate.
- */
-void
-xfs_quiesce_attr(
- struct xfs_mount *mp)
-{
- int error = 0;
-
- /* wait for all modifications to complete */
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
-
- /* force the log to unpin objects from the now complete transactions */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /* reclaim inodes to do any IO before the freeze completes */
- xfs_reclaim_inodes(mp, 0);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- /*
- * Just warn here till VFS can correctly support
- * read-only remount without racing.
- */
- WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
- xfs_log_quiesce(mp);
-}
-
-/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
* to the log to dirty it in case of a crash while frozen. This ensures that we
@@ -866,11 +901,32 @@ xfs_fs_freeze(
struct super_block *sb)
{
struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags;
+ int ret;
- xfs_stop_block_reaping(mp);
+ /*
+ * The filesystem is now frozen far enough that memory reclaim
+ * cannot safely operate on the filesystem. Hence we need to
+ * set a GFP_NOFS context here to avoid recursion deadlocks.
+ */
+ flags = memalloc_nofs_save();
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
- return xfs_sync_sb(mp, true);
+ ret = xfs_log_quiesce(mp);
+ memalloc_nofs_restore(flags);
+
+ /*
+ * For read-write filesystems, we need to restart the inodegc on error
+ * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not
+ * going to be run to restart it now. We are at SB_FREEZE_FS level
+ * here, so we can restart safely without racing with a stop in
+ * xfs_fs_sync_fs().
+ */
+ if (ret && !xfs_is_readonly(mp)) {
+ xfs_blockgc_start(mp);
+ xfs_inodegc_start(mp);
+ }
+
+ return ret;
}
STATIC int
@@ -881,7 +937,18 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_start_block_reaping(mp);
+
+ /*
+ * Don't reactivate the inodegc worker on a readonly filesystem because
+ * inodes are sent directly to reclaim. Don't reactivate the blockgc
+ * worker because there are no speculative preallocations on a readonly
+ * filesystem.
+ */
+ if (!xfs_is_readonly(mp)) {
+ xfs_blockgc_start(mp);
+ xfs_inodegc_start(mp);
+ }
+
return 0;
}
@@ -893,10 +960,8 @@ STATIC int
xfs_finish_flags(
struct xfs_mount *mp)
{
- int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
/* Fail a mount where the logbuf is smaller than the log stripe */
- if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+ if (xfs_has_logv2(mp)) {
if (mp->m_logbsize <= 0 &&
mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
mp->m_logbsize = mp->m_sb.sb_logsunit;
@@ -918,33 +983,24 @@ xfs_finish_flags(
/*
* V5 filesystems always use attr2 format for attributes.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
"attr2 is always enabled for V5 filesystems.");
return -EINVAL;
}
/*
- * mkfs'ed attr2 will turn on attr2 mount unless explicitly
- * told by noattr2 to turn it off
- */
- if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- !(mp->m_flags & XFS_MOUNT_NOATTR2))
- mp->m_flags |= XFS_MOUNT_ATTR2;
-
- /*
* prohibit r/w mounts of read-only filesystems
*/
- if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
xfs_warn(mp,
"cannot mount a read-only filesystem as read-write");
return -EROFS;
}
- if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
- (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
- !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+ if ((mp->m_qflags & XFS_GQUOTA_ACCT) &&
+ (mp->m_qflags & XFS_PQUOTA_ACCT) &&
+ !xfs_has_pquotino(mp)) {
xfs_warn(mp,
"Super block does not support project and group quota together");
return -EINVAL;
@@ -975,8 +1031,14 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc;
+
return 0;
+free_delalloc:
+ percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
@@ -993,6 +1055,7 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
}
static void
@@ -1002,9 +1065,39 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
- ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
+ percpu_counter_destroy(&mp->m_frextents);
+}
+
+static int
+xfs_inodegc_init_percpu(
+ struct xfs_mount *mp)
+{
+ struct xfs_inodegc *gc;
+ int cpu;
+
+ mp->m_inodegc = alloc_percpu(struct xfs_inodegc);
+ if (!mp->m_inodegc)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ gc = per_cpu_ptr(mp->m_inodegc, cpu);
+ init_llist_head(&gc->list);
+ gc->items = 0;
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+ }
+ return 0;
+}
+
+static void
+xfs_inodegc_free_percpu(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_inodegc)
+ return;
+ free_percpu(mp->m_inodegc);
}
static void
@@ -1023,6 +1116,8 @@ xfs_fs_put_super(
xfs_freesb(mp);
free_percpu(mp->m_stats.xs_stats);
+ xfs_mount_list_del(mp);
+ xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1100,17 +1195,33 @@ suffix_kstrtoint(
return ret;
}
+static inline void
+xfs_fs_warn_deprecated(
+ struct fs_context *fc,
+ struct fs_parameter *param,
+ uint64_t flag,
+ bool value)
+{
+ /* Don't print the warning if reconfiguring and current mount point
+ * already had the flag set
+ */
+ if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
+ !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
+ return;
+ xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
+}
+
/*
* Set mount state from a mount option.
*
* NOTE: mp->m_super is NULL here!
*/
static int
-xfs_fc_parse_param(
+xfs_fs_parse_param(
struct fs_context *fc,
struct fs_parameter *param)
{
- struct xfs_mount *mp = fc->s_fs_info;
+ struct xfs_mount *parsing_mp = fc->s_fs_info;
struct fs_parse_result result;
int size = 0;
int opt;
@@ -1121,134 +1232,137 @@ xfs_fc_parse_param(
switch (opt) {
case Opt_logbufs:
- mp->m_logbufs = result.uint_32;
+ parsing_mp->m_logbufs = result.uint_32;
return 0;
case Opt_logbsize:
- if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize))
+ if (suffix_kstrtoint(param->string, 10, &parsing_mp->m_logbsize))
return -EINVAL;
return 0;
case Opt_logdev:
- kfree(mp->m_logname);
- mp->m_logname = kstrdup(param->string, GFP_KERNEL);
- if (!mp->m_logname)
+ kfree(parsing_mp->m_logname);
+ parsing_mp->m_logname = kstrdup(param->string, GFP_KERNEL);
+ if (!parsing_mp->m_logname)
return -ENOMEM;
return 0;
case Opt_rtdev:
- kfree(mp->m_rtname);
- mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
- if (!mp->m_rtname)
+ kfree(parsing_mp->m_rtname);
+ parsing_mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
+ if (!parsing_mp->m_rtname)
return -ENOMEM;
return 0;
case Opt_allocsize:
if (suffix_kstrtoint(param->string, 10, &size))
return -EINVAL;
- mp->m_allocsize_log = ffs(size) - 1;
- mp->m_flags |= XFS_MOUNT_ALLOCSIZE;
+ parsing_mp->m_allocsize_log = ffs(size) - 1;
+ parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE;
return 0;
case Opt_grpid:
case Opt_bsdgroups:
- mp->m_flags |= XFS_MOUNT_GRPID;
+ parsing_mp->m_features |= XFS_FEAT_GRPID;
return 0;
case Opt_nogrpid:
case Opt_sysvgroups:
- mp->m_flags &= ~XFS_MOUNT_GRPID;
+ parsing_mp->m_features &= ~XFS_FEAT_GRPID;
return 0;
case Opt_wsync:
- mp->m_flags |= XFS_MOUNT_WSYNC;
+ parsing_mp->m_features |= XFS_FEAT_WSYNC;
return 0;
case Opt_norecovery:
- mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ parsing_mp->m_features |= XFS_FEAT_NORECOVERY;
return 0;
case Opt_noalign:
- mp->m_flags |= XFS_MOUNT_NOALIGN;
+ parsing_mp->m_features |= XFS_FEAT_NOALIGN;
return 0;
case Opt_swalloc:
- mp->m_flags |= XFS_MOUNT_SWALLOC;
+ parsing_mp->m_features |= XFS_FEAT_SWALLOC;
return 0;
case Opt_sunit:
- mp->m_dalign = result.uint_32;
+ parsing_mp->m_dalign = result.uint_32;
return 0;
case Opt_swidth:
- mp->m_swidth = result.uint_32;
+ parsing_mp->m_swidth = result.uint_32;
return 0;
case Opt_inode32:
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS;
return 0;
case Opt_inode64:
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
return 0;
case Opt_nouuid:
- mp->m_flags |= XFS_MOUNT_NOUUID;
- return 0;
- case Opt_ikeep:
- mp->m_flags |= XFS_MOUNT_IKEEP;
- return 0;
- case Opt_noikeep:
- mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ parsing_mp->m_features |= XFS_FEAT_NOUUID;
return 0;
case Opt_largeio:
- mp->m_flags |= XFS_MOUNT_LARGEIO;
+ parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE;
return 0;
case Opt_nolargeio:
- mp->m_flags &= ~XFS_MOUNT_LARGEIO;
- return 0;
- case Opt_attr2:
- mp->m_flags |= XFS_MOUNT_ATTR2;
- return 0;
- case Opt_noattr2:
- mp->m_flags &= ~XFS_MOUNT_ATTR2;
- mp->m_flags |= XFS_MOUNT_NOATTR2;
+ parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE;
return 0;
case Opt_filestreams:
- mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+ parsing_mp->m_features |= XFS_FEAT_FILESTREAMS;
return 0;
case Opt_noquota:
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
return 0;
case Opt_quota:
case Opt_uquota:
case Opt_usrquota:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
- XFS_UQUOTA_ENFD);
+ parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD);
return 0;
case Opt_qnoenforce:
case Opt_uqnoenforce:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_UQUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD;
return 0;
case Opt_pquota:
case Opt_prjquota:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_PQUOTA_ENFD);
+ parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD);
return 0;
case Opt_pqnoenforce:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_PQUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD;
return 0;
case Opt_gquota:
case Opt_grpquota:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_GQUOTA_ENFD);
+ parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD);
return 0;
case Opt_gqnoenforce:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_GQUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD;
return 0;
case Opt_discard:
- mp->m_flags |= XFS_MOUNT_DISCARD;
+ parsing_mp->m_features |= XFS_FEAT_DISCARD;
return 0;
case Opt_nodiscard:
- mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ parsing_mp->m_features &= ~XFS_FEAT_DISCARD;
return 0;
#ifdef CONFIG_FS_DAX
case Opt_dax:
- mp->m_flags |= XFS_MOUNT_DAX;
+ xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS);
+ return 0;
+ case Opt_dax_enum:
+ xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
+ /* Following mount options will be removed in September 2025 */
+ case Opt_ikeep:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
+ parsing_mp->m_features |= XFS_FEAT_IKEEP;
+ return 0;
+ case Opt_noikeep:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
+ parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
+ return 0;
+ case Opt_attr2:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
+ parsing_mp->m_features |= XFS_FEAT_ATTR2;
+ return 0;
+ case Opt_noattr2:
+ xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
+ parsing_mp->m_features |= XFS_FEAT_NOATTR2;
+ return 0;
default:
- xfs_warn(mp, "unknown mount option [%s].", param->key);
+ xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
}
@@ -1256,20 +1370,26 @@ xfs_fc_parse_param(
}
static int
-xfs_fc_validate_params(
+xfs_fs_validate_params(
struct xfs_mount *mp)
{
+ /* No recovery flag requires a read-only mount */
+ if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) {
+ xfs_warn(mp, "no-recovery mounts must be read-only.");
+ return -EINVAL;
+ }
+
/*
- * no recovery flag requires a read-only mount
+ * We have not read the superblock at this point, so only the attr2
+ * mount option can set the attr2 feature by this stage.
*/
- if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
- !(mp->m_flags & XFS_MOUNT_RDONLY)) {
- xfs_warn(mp, "no-recovery mounts must be read-only.");
+ if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
+ xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
return -EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_NOALIGN) &&
- (mp->m_dalign || mp->m_swidth)) {
+
+ if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
return -EINVAL;
@@ -1313,7 +1433,7 @@ xfs_fc_validate_params(
return -EINVAL;
}
- if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
+ if (xfs_has_allocsize(mp) &&
(mp->m_allocsize_log > XFS_MAX_IO_LOG ||
mp->m_allocsize_log < XFS_MIN_IO_LOG)) {
xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
@@ -1325,7 +1445,7 @@ xfs_fc_validate_params(
}
static int
-xfs_fc_fill_super(
+xfs_fs_fill_super(
struct super_block *sb,
struct fs_context *fc)
{
@@ -1335,7 +1455,7 @@ xfs_fc_fill_super(
mp->m_super = sb;
- error = xfs_fc_validate_params(mp);
+ error = xfs_fs_validate_params(mp);
if (error)
goto out_free_names;
@@ -1374,11 +1494,22 @@ xfs_fc_fill_super(
if (error)
goto out_destroy_workqueues;
+ error = xfs_inodegc_init_percpu(mp);
+ if (error)
+ goto out_destroy_counters;
+
+ /*
+ * All percpu data structures requiring cleanup when a cpu goes offline
+ * must be allocated before adding this @mp to the cpu-dead handler's
+ * mount list.
+ */
+ xfs_mount_list_add(mp);
+
/* Allocate stats memory before we do operations that might use it */
mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
if (!mp->m_stats.xs_stats) {
error = -ENOMEM;
- goto out_destroy_counters;
+ goto out_destroy_inodegc;
}
error = xfs_readsb(mp, flags);
@@ -1393,6 +1524,58 @@ xfs_fc_fill_super(
if (error)
goto out_free_sb;
+ /* V4 support is undergoing deprecation. */
+ if (!xfs_has_crc(mp)) {
+#ifdef CONFIG_XFS_SUPPORT_V4
+ xfs_warn_once(mp,
+ "Deprecated V4 format (crc=0) will not be supported after September 2030.");
+#else
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+#endif
+ }
+
+ /* Filesystem claims it needs repair, so refuse the mount. */
+ if (xfs_has_needsrepair(mp)) {
+ xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
+ error = -EFSCORRUPTED;
+ goto out_free_sb;
+ }
+
+ /*
+ * Don't touch the filesystem if a user tool thinks it owns the primary
+ * superblock. mkfs doesn't clear the flag from secondary supers, so
+ * we don't check them at all.
+ */
+ if (mp->m_sb.sb_inprogress) {
+ xfs_warn(mp, "Offline file system operation in progress!");
+ error = -EFSCORRUPTED;
+ goto out_free_sb;
+ }
+
+ /*
+ * Until this is fixed only page-sized or smaller data blocks work.
+ */
+ if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
+ xfs_warn(mp,
+ "File system with blocksize %d bytes. "
+ "Only pagesize (%ld) or less will currently work.",
+ mp->m_sb.sb_blocksize, PAGE_SIZE);
+ error = -ENOSYS;
+ goto out_free_sb;
+ }
+
+ /* Ensure this filesystem fits in the page cache limits */
+ if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) ||
+ xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) {
+ xfs_warn(mp,
+ "file system too large to be mounted on this system.");
+ error = -EFBIG;
+ goto out_free_sb;
+ }
+
/*
* XFS block mappings use 54 bits to store the logical block offset.
* This should suffice to handle the maximum file size that the VFS
@@ -1404,7 +1587,7 @@ xfs_fc_fill_super(
* Avoid integer overflow by comparing the maximum bmbt offset to the
* maximum pagecache offset in units of fs blocks.
*/
- if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) {
+ if (!xfs_verify_fileoff(mp, XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE))) {
xfs_warn(mp,
"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
@@ -1427,51 +1610,35 @@ xfs_fc_fill_super(
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
- sb->s_time_min = S32_MIN;
- sb->s_time_max = S32_MAX;
+ if (xfs_has_bigtime(mp)) {
+ sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
+ sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
+ } else {
+ sb->s_time_min = XFS_LEGACY_TIME_MIN;
+ sb->s_time_max = XFS_LEGACY_TIME_MAX;
+ }
+ trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
sb->s_iflags |= SB_I_CGROUPWB;
set_posix_acl_flag(sb);
/* version 5 superblocks support inode version counters. */
- if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ if (xfs_has_crc(mp))
sb->s_flags |= SB_I_VERSION;
- if (mp->m_flags & XFS_MOUNT_DAX) {
- bool rtdev_is_dax = false, datadev_is_dax;
-
- xfs_warn(mp,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-
- datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
- sb->s_blocksize);
- if (mp->m_rtdev_targp)
- rtdev_is_dax = bdev_dax_supported(
- mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
- if (!rtdev_is_dax && !datadev_is_dax) {
- xfs_alert(mp,
- "DAX unsupported by block device. Turning off DAX.");
- mp->m_flags &= ~XFS_MOUNT_DAX;
- }
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- xfs_alert(mp,
- "DAX and reflink cannot be used together!");
- error = -EINVAL;
+ if (xfs_has_dax_always(mp)) {
+ error = xfs_setup_dax_always(mp);
+ if (error)
goto out_filestream_unmount;
- }
}
- if (mp->m_flags & XFS_MOUNT_DISCARD) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- xfs_warn(mp, "mounting with \"discard\" option, but "
- "the device does not support discard");
- mp->m_flags &= ~XFS_MOUNT_DISCARD;
- }
+ if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) {
+ xfs_warn(mp,
+ "mounting with \"discard\" option, but the device does not support discard");
+ mp->m_features &= ~XFS_FEAT_DISCARD;
}
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,
"reflink not compatible with realtime device!");
@@ -1485,13 +1652,17 @@ xfs_fc_fill_super(
}
}
- if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
+ if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) {
xfs_alert(mp,
"reverse mapping btree not compatible with realtime device!");
error = -EINVAL;
goto out_filestream_unmount;
}
+ if (xfs_has_large_extent_counts(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1515,6 +1686,9 @@ xfs_fc_fill_super(
xfs_freesb(mp);
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
+ out_destroy_inodegc:
+ xfs_mount_list_del(mp);
+ xfs_inodegc_free_percpu(mp);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
out_destroy_workqueues:
@@ -1533,10 +1707,10 @@ xfs_fc_fill_super(
}
static int
-xfs_fc_get_tree(
+xfs_fs_get_tree(
struct fs_context *fc)
{
- return get_tree_bdev(fc, xfs_fc_fill_super);
+ return get_tree_bdev(fc, xfs_fs_fill_super);
}
static int
@@ -1546,13 +1720,13 @@ xfs_remount_rw(
struct xfs_sb *sbp = &mp->m_sb;
int error;
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+ if (xfs_has_norecovery(mp)) {
xfs_warn(mp,
"ro->rw transition prohibited on norecovery mount");
return -EINVAL;
}
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ if (xfs_sb_is_v5(sbp) &&
xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
xfs_warn(mp,
"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
@@ -1561,7 +1735,7 @@ xfs_remount_rw(
return -EINVAL;
}
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
+ clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
/*
* If this is the first remount to writeable state we might have some
@@ -1582,22 +1756,16 @@ xfs_remount_rw(
*/
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
-
- /* Recover any CoW blocks that never got remapped. */
- error = xfs_reflink_recover_cow(mp);
- if (error) {
- xfs_err(mp,
- "Error %d recovering leftover CoW allocations.", error);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
return error;
+ /* Re-enable the background inode inactivation worker. */
+ xfs_inodegc_start(mp);
+
return 0;
}
@@ -1605,21 +1773,43 @@ static int
xfs_remount_ro(
struct xfs_mount *mp)
{
- int error;
+ struct xfs_icwalk icw = {
+ .icw_flags = XFS_ICWALK_FLAG_SYNC,
+ };
+ int error;
+
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
/*
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
*/
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
- /* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
+ /*
+ * Clear out all remaining COW staging extents and speculative post-EOF
+ * preallocations so that we don't leave inodes requiring inactivation
+ * cleanups during reclaim on a read-only mount. We must process every
+ * cached inode, so this requires a synchronous cache scan.
+ */
+ error = xfs_blockgc_free_space(mp, &icw);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
+ /*
+ * Stop the inodegc background worker. xfs_fs_reconfigure already
+ * flushed all pending inodegc work when it sync'd the filesystem.
+ * The VFS holds s_umount, so we know that inodes cannot enter
+ * xfs_fs_destroy_inode during a remount operation. In readonly mode
+ * we send inodes straight to reclaim, so no inodes will be queued.
+ */
+ xfs_inodegc_stop(mp);
+
/* Free the per-AG metadata reservation pool. */
error = xfs_fs_unreserve_ag_blocks(mp);
if (error) {
@@ -1636,8 +1826,8 @@ xfs_remount_ro(
*/
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ xfs_log_clean(mp);
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
return 0;
}
@@ -1655,44 +1845,43 @@ xfs_remount_ro(
* silently ignore all options that we can't actually change.
*/
static int
-xfs_fc_reconfigure(
+xfs_fs_reconfigure(
struct fs_context *fc)
{
struct xfs_mount *mp = XFS_M(fc->root->d_sb);
struct xfs_mount *new_mp = fc->s_fs_info;
- xfs_sb_t *sbp = &mp->m_sb;
int flags = fc->sb_flags;
int error;
- error = xfs_fc_validate_params(new_mp);
+ /* version 5 superblocks always support version counters. */
+ if (xfs_has_crc(mp))
+ fc->sb_flags |= SB_I_VERSION;
+
+ error = xfs_fs_validate_params(new_mp);
if (error)
return error;
- sync_filesystem(mp->m_super);
-
/* inode32 -> inode64 */
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
- !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
+ mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
/* inode64 -> inode32 */
- if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
- (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) {
+ mp->m_features |= XFS_FEAT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
/* ro -> rw */
- if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) {
+ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
error = xfs_remount_rw(mp);
if (error)
return error;
}
/* rw -> ro */
- if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) {
+ if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) {
error = xfs_remount_ro(mp);
if (error)
return error;
@@ -1701,7 +1890,7 @@ xfs_fc_reconfigure(
return 0;
}
-static void xfs_fc_free(
+static void xfs_fs_free(
struct fs_context *fc)
{
struct xfs_mount *mp = fc->s_fs_info;
@@ -1717,10 +1906,10 @@ static void xfs_fc_free(
}
static const struct fs_context_operations xfs_context_ops = {
- .parse_param = xfs_fc_parse_param,
- .get_tree = xfs_fc_get_tree,
- .reconfigure = xfs_fc_reconfigure,
- .free = xfs_fc_free,
+ .parse_param = xfs_fs_parse_param,
+ .get_tree = xfs_fs_get_tree,
+ .reconfigure = xfs_fs_reconfigure,
+ .free = xfs_fs_free,
};
static int xfs_init_fs_context(
@@ -1737,10 +1926,8 @@ static int xfs_init_fs_context(
INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
- atomic_set(&mp->m_active_trans, 0);
+ INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
/*
* We don't create the finobt per-ag space reservation until after log
@@ -1761,11 +1948,11 @@ static int xfs_init_fs_context(
* Copy binary VFS mount flags we are interested in.
*/
if (fc->sb_flags & SB_RDONLY)
- mp->m_flags |= XFS_MOUNT_RDONLY;
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (fc->sb_flags & SB_DIRSYNC)
- mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ mp->m_features |= XFS_FEAT_DIRSYNC;
if (fc->sb_flags & SB_SYNCHRONOUS)
- mp->m_flags |= XFS_MOUNT_WSYNC;
+ mp->m_features |= XFS_FEAT_WSYNC;
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
@@ -1779,200 +1966,233 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("xfs");
STATIC int __init
-xfs_init_zones(void)
+xfs_init_caches(void)
{
- xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+ int error;
+
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!xfs_buf_cache)
+ goto out;
+
+ xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
sizeof(struct xlog_ticket),
0, 0, NULL);
- if (!xfs_log_ticket_zone)
- goto out;
+ if (!xfs_log_ticket_cache)
+ goto out_destroy_buf_cache;
- xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
- sizeof(struct xfs_extent_free_item),
- 0, 0, NULL);
- if (!xfs_bmap_free_item_zone)
- goto out_destroy_log_ticket_zone;
+ error = xfs_btree_init_cur_caches();
+ if (error)
+ goto out_destroy_log_ticket_cache;
- xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
- sizeof(struct xfs_btree_cur),
- 0, 0, NULL);
- if (!xfs_btree_cur_zone)
- goto out_destroy_bmap_free_item_zone;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_btree_cur_cache;
- xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+ xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
- if (!xfs_da_state_zone)
- goto out_destroy_btree_cur_zone;
+ if (!xfs_da_state_cache)
+ goto out_destroy_defer_item_cache;
- xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+ xfs_ifork_cache = kmem_cache_create("xfs_ifork",
sizeof(struct xfs_ifork),
0, 0, NULL);
- if (!xfs_ifork_zone)
- goto out_destroy_da_state_zone;
+ if (!xfs_ifork_cache)
+ goto out_destroy_da_state_cache;
- xfs_trans_zone = kmem_cache_create("xf_trans",
+ xfs_trans_cache = kmem_cache_create("xfs_trans",
sizeof(struct xfs_trans),
0, 0, NULL);
- if (!xfs_trans_zone)
- goto out_destroy_ifork_zone;
+ if (!xfs_trans_cache)
+ goto out_destroy_ifork_cache;
/*
- * The size of the zone allocated buf log item is the maximum
+ * The size of the cache-allocated buf log item is the maximum
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_cache_create("xfs_buf_item",
+ xfs_buf_item_cache = kmem_cache_create("xfs_buf_item",
sizeof(struct xfs_buf_log_item),
0, 0, NULL);
- if (!xfs_buf_item_zone)
- goto out_destroy_trans_zone;
-
- xfs_efd_zone = kmem_cache_create("xfs_efd_item",
- (sizeof(struct xfs_efd_log_item) +
- (XFS_EFD_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_extent)),
- 0, 0, NULL);
- if (!xfs_efd_zone)
- goto out_destroy_buf_item_zone;
-
- xfs_efi_zone = kmem_cache_create("xfs_efi_item",
- (sizeof(struct xfs_efi_log_item) +
- (XFS_EFI_MAX_FAST_EXTENTS - 1) *
- sizeof(struct xfs_extent)),
- 0, 0, NULL);
- if (!xfs_efi_zone)
- goto out_destroy_efd_zone;
+ if (!xfs_buf_item_cache)
+ goto out_destroy_trans_cache;
- xfs_inode_zone = kmem_cache_create("xfs_inode",
+ xfs_efd_cache = kmem_cache_create("xfs_efd_item",
+ xfs_efd_log_item_sizeof(XFS_EFD_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_efd_cache)
+ goto out_destroy_buf_item_cache;
+
+ xfs_efi_cache = kmem_cache_create("xfs_efi_item",
+ xfs_efi_log_item_sizeof(XFS_EFI_MAX_FAST_EXTENTS),
+ 0, 0, NULL);
+ if (!xfs_efi_cache)
+ goto out_destroy_efd_cache;
+
+ xfs_inode_cache = kmem_cache_create("xfs_inode",
sizeof(struct xfs_inode), 0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD | SLAB_ACCOUNT),
xfs_fs_inode_init_once);
- if (!xfs_inode_zone)
- goto out_destroy_efi_zone;
+ if (!xfs_inode_cache)
+ goto out_destroy_efi_cache;
- xfs_ili_zone = kmem_cache_create("xfs_ili",
+ xfs_ili_cache = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!xfs_ili_zone)
- goto out_destroy_inode_zone;
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!xfs_ili_cache)
+ goto out_destroy_inode_cache;
- xfs_icreate_zone = kmem_cache_create("xfs_icr",
+ xfs_icreate_cache = kmem_cache_create("xfs_icr",
sizeof(struct xfs_icreate_item),
0, 0, NULL);
- if (!xfs_icreate_zone)
- goto out_destroy_ili_zone;
+ if (!xfs_icreate_cache)
+ goto out_destroy_ili_cache;
- xfs_rud_zone = kmem_cache_create("xfs_rud_item",
+ xfs_rud_cache = kmem_cache_create("xfs_rud_item",
sizeof(struct xfs_rud_log_item),
0, 0, NULL);
- if (!xfs_rud_zone)
- goto out_destroy_icreate_zone;
+ if (!xfs_rud_cache)
+ goto out_destroy_icreate_cache;
- xfs_rui_zone = kmem_cache_create("xfs_rui_item",
+ xfs_rui_cache = kmem_cache_create("xfs_rui_item",
xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_rui_zone)
- goto out_destroy_rud_zone;
+ if (!xfs_rui_cache)
+ goto out_destroy_rud_cache;
- xfs_cud_zone = kmem_cache_create("xfs_cud_item",
+ xfs_cud_cache = kmem_cache_create("xfs_cud_item",
sizeof(struct xfs_cud_log_item),
0, 0, NULL);
- if (!xfs_cud_zone)
- goto out_destroy_rui_zone;
+ if (!xfs_cud_cache)
+ goto out_destroy_rui_cache;
- xfs_cui_zone = kmem_cache_create("xfs_cui_item",
+ xfs_cui_cache = kmem_cache_create("xfs_cui_item",
xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_cui_zone)
- goto out_destroy_cud_zone;
+ if (!xfs_cui_cache)
+ goto out_destroy_cud_cache;
- xfs_bud_zone = kmem_cache_create("xfs_bud_item",
+ xfs_bud_cache = kmem_cache_create("xfs_bud_item",
sizeof(struct xfs_bud_log_item),
0, 0, NULL);
- if (!xfs_bud_zone)
- goto out_destroy_cui_zone;
+ if (!xfs_bud_cache)
+ goto out_destroy_cui_cache;
- xfs_bui_zone = kmem_cache_create("xfs_bui_item",
+ xfs_bui_cache = kmem_cache_create("xfs_bui_item",
xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
0, 0, NULL);
- if (!xfs_bui_zone)
- goto out_destroy_bud_zone;
+ if (!xfs_bui_cache)
+ goto out_destroy_bud_cache;
+
+ xfs_attrd_cache = kmem_cache_create("xfs_attrd_item",
+ sizeof(struct xfs_attrd_log_item),
+ 0, 0, NULL);
+ if (!xfs_attrd_cache)
+ goto out_destroy_bui_cache;
+
+ xfs_attri_cache = kmem_cache_create("xfs_attri_item",
+ sizeof(struct xfs_attri_log_item),
+ 0, 0, NULL);
+ if (!xfs_attri_cache)
+ goto out_destroy_attrd_cache;
+
+ xfs_iunlink_cache = kmem_cache_create("xfs_iul_item",
+ sizeof(struct xfs_iunlink_item),
+ 0, 0, NULL);
+ if (!xfs_iunlink_cache)
+ goto out_destroy_attri_cache;
return 0;
- out_destroy_bud_zone:
- kmem_cache_destroy(xfs_bud_zone);
- out_destroy_cui_zone:
- kmem_cache_destroy(xfs_cui_zone);
- out_destroy_cud_zone:
- kmem_cache_destroy(xfs_cud_zone);
- out_destroy_rui_zone:
- kmem_cache_destroy(xfs_rui_zone);
- out_destroy_rud_zone:
- kmem_cache_destroy(xfs_rud_zone);
- out_destroy_icreate_zone:
- kmem_cache_destroy(xfs_icreate_zone);
- out_destroy_ili_zone:
- kmem_cache_destroy(xfs_ili_zone);
- out_destroy_inode_zone:
- kmem_cache_destroy(xfs_inode_zone);
- out_destroy_efi_zone:
- kmem_cache_destroy(xfs_efi_zone);
- out_destroy_efd_zone:
- kmem_cache_destroy(xfs_efd_zone);
- out_destroy_buf_item_zone:
- kmem_cache_destroy(xfs_buf_item_zone);
- out_destroy_trans_zone:
- kmem_cache_destroy(xfs_trans_zone);
- out_destroy_ifork_zone:
- kmem_cache_destroy(xfs_ifork_zone);
- out_destroy_da_state_zone:
- kmem_cache_destroy(xfs_da_state_zone);
- out_destroy_btree_cur_zone:
- kmem_cache_destroy(xfs_btree_cur_zone);
- out_destroy_bmap_free_item_zone:
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- out_destroy_log_ticket_zone:
- kmem_cache_destroy(xfs_log_ticket_zone);
+ out_destroy_attri_cache:
+ kmem_cache_destroy(xfs_attri_cache);
+ out_destroy_attrd_cache:
+ kmem_cache_destroy(xfs_attrd_cache);
+ out_destroy_bui_cache:
+ kmem_cache_destroy(xfs_bui_cache);
+ out_destroy_bud_cache:
+ kmem_cache_destroy(xfs_bud_cache);
+ out_destroy_cui_cache:
+ kmem_cache_destroy(xfs_cui_cache);
+ out_destroy_cud_cache:
+ kmem_cache_destroy(xfs_cud_cache);
+ out_destroy_rui_cache:
+ kmem_cache_destroy(xfs_rui_cache);
+ out_destroy_rud_cache:
+ kmem_cache_destroy(xfs_rud_cache);
+ out_destroy_icreate_cache:
+ kmem_cache_destroy(xfs_icreate_cache);
+ out_destroy_ili_cache:
+ kmem_cache_destroy(xfs_ili_cache);
+ out_destroy_inode_cache:
+ kmem_cache_destroy(xfs_inode_cache);
+ out_destroy_efi_cache:
+ kmem_cache_destroy(xfs_efi_cache);
+ out_destroy_efd_cache:
+ kmem_cache_destroy(xfs_efd_cache);
+ out_destroy_buf_item_cache:
+ kmem_cache_destroy(xfs_buf_item_cache);
+ out_destroy_trans_cache:
+ kmem_cache_destroy(xfs_trans_cache);
+ out_destroy_ifork_cache:
+ kmem_cache_destroy(xfs_ifork_cache);
+ out_destroy_da_state_cache:
+ kmem_cache_destroy(xfs_da_state_cache);
+ out_destroy_defer_item_cache:
+ xfs_defer_destroy_item_caches();
+ out_destroy_btree_cur_cache:
+ xfs_btree_destroy_cur_caches();
+ out_destroy_log_ticket_cache:
+ kmem_cache_destroy(xfs_log_ticket_cache);
+ out_destroy_buf_cache:
+ kmem_cache_destroy(xfs_buf_cache);
out:
return -ENOMEM;
}
STATIC void
-xfs_destroy_zones(void)
+xfs_destroy_caches(void)
{
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(xfs_bui_zone);
- kmem_cache_destroy(xfs_bud_zone);
- kmem_cache_destroy(xfs_cui_zone);
- kmem_cache_destroy(xfs_cud_zone);
- kmem_cache_destroy(xfs_rui_zone);
- kmem_cache_destroy(xfs_rud_zone);
- kmem_cache_destroy(xfs_icreate_zone);
- kmem_cache_destroy(xfs_ili_zone);
- kmem_cache_destroy(xfs_inode_zone);
- kmem_cache_destroy(xfs_efi_zone);
- kmem_cache_destroy(xfs_efd_zone);
- kmem_cache_destroy(xfs_buf_item_zone);
- kmem_cache_destroy(xfs_trans_zone);
- kmem_cache_destroy(xfs_ifork_zone);
- kmem_cache_destroy(xfs_da_state_zone);
- kmem_cache_destroy(xfs_btree_cur_zone);
- kmem_cache_destroy(xfs_bmap_free_item_zone);
- kmem_cache_destroy(xfs_log_ticket_zone);
+ kmem_cache_destroy(xfs_iunlink_cache);
+ kmem_cache_destroy(xfs_attri_cache);
+ kmem_cache_destroy(xfs_attrd_cache);
+ kmem_cache_destroy(xfs_bui_cache);
+ kmem_cache_destroy(xfs_bud_cache);
+ kmem_cache_destroy(xfs_cui_cache);
+ kmem_cache_destroy(xfs_cud_cache);
+ kmem_cache_destroy(xfs_rui_cache);
+ kmem_cache_destroy(xfs_rud_cache);
+ kmem_cache_destroy(xfs_icreate_cache);
+ kmem_cache_destroy(xfs_ili_cache);
+ kmem_cache_destroy(xfs_inode_cache);
+ kmem_cache_destroy(xfs_efi_cache);
+ kmem_cache_destroy(xfs_efd_cache);
+ kmem_cache_destroy(xfs_buf_item_cache);
+ kmem_cache_destroy(xfs_trans_cache);
+ kmem_cache_destroy(xfs_ifork_cache);
+ kmem_cache_destroy(xfs_da_state_cache);
+ xfs_defer_destroy_item_caches();
+ xfs_btree_destroy_cur_caches();
+ kmem_cache_destroy(xfs_log_ticket_cache);
+ kmem_cache_destroy(xfs_buf_cache);
}
STATIC int __init
@@ -1985,11 +2205,12 @@ xfs_init_workqueues(void)
* max_active value for this workqueue.
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
if (!xfs_alloc_wq)
return -ENOMEM;
- xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+ xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+ 0);
if (!xfs_discard_wq)
goto out_free_alloc_wq;
@@ -2006,6 +2227,49 @@ xfs_destroy_workqueues(void)
destroy_workqueue(xfs_alloc_wq);
}
+#ifdef CONFIG_HOTPLUG_CPU
+static int
+xfs_cpu_dead(
+ unsigned int cpu)
+{
+ struct xfs_mount *mp, *n;
+
+ spin_lock(&xfs_mount_list_lock);
+ list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
+ spin_unlock(&xfs_mount_list_lock);
+ xfs_inodegc_cpu_dead(mp, cpu);
+ xlog_cil_pcp_dead(mp->m_log, cpu);
+ spin_lock(&xfs_mount_list_lock);
+ }
+ spin_unlock(&xfs_mount_list_lock);
+ return 0;
+}
+
+static int __init
+xfs_cpu_hotplug_init(void)
+{
+ int error;
+
+ error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
+ xfs_cpu_dead);
+ if (error < 0)
+ xfs_alert(NULL,
+"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
+ error);
+ return error;
+}
+
+static void
+xfs_cpu_hotplug_destroy(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline int xfs_cpu_hotplug_init(void) { return 0; }
+static inline void xfs_cpu_hotplug_destroy(void) {}
+#endif
+
STATIC int __init
init_xfs_fs(void)
{
@@ -2018,25 +2282,25 @@ init_xfs_fs(void)
xfs_dir_startup();
- error = xfs_init_zones();
+ error = xfs_cpu_hotplug_init();
if (error)
goto out;
+ error = xfs_init_caches();
+ if (error)
+ goto out_destroy_hp;
+
error = xfs_init_workqueues();
if (error)
- goto out_destroy_zones;
+ goto out_destroy_caches;
error = xfs_mru_cache_init();
if (error)
goto out_destroy_wq;
- error = xfs_buf_init();
- if (error)
- goto out_mru_cache_uninit;
-
error = xfs_init_procfs();
if (error)
- goto out_buf_terminate;
+ goto out_mru_cache_uninit;
error = xfs_sysctl_register();
if (error)
@@ -2093,14 +2357,14 @@ init_xfs_fs(void)
xfs_sysctl_unregister();
out_cleanup_procfs:
xfs_cleanup_procfs();
- out_buf_terminate:
- xfs_buf_terminate();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
out_destroy_wq:
xfs_destroy_workqueues();
- out_destroy_zones:
- xfs_destroy_zones();
+ out_destroy_caches:
+ xfs_destroy_caches();
+ out_destroy_hp:
+ xfs_cpu_hotplug_destroy();
out:
return error;
}
@@ -2118,11 +2382,11 @@ exit_xfs_fs(void)
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
- xfs_buf_terminate();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
- xfs_destroy_zones();
+ xfs_destroy_caches();
xfs_uuid_table_free();
+ xfs_cpu_hotplug_destroy();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index b552cf6d3379..364e2c2648a8 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -75,20 +75,24 @@ extern void xfs_qm_exit(void);
XFS_ASSERT_FATAL_STRING \
XFS_DBG_STRING /* DBG must be last */
+#ifdef DEBUG
+# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags))
+#else
+# define XFS_WQFLAGS(wqflags) (wqflags)
+#endif
+
struct xfs_inode;
struct xfs_mount;
struct xfs_buftarg;
struct block_device;
-extern void xfs_quiesce_attr(struct xfs_mount *mp);
extern void xfs_flush_inodes(struct xfs_mount *mp);
-extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
xfs_agnumber_t agcount);
extern const struct export_operations xfs_export_operations;
-extern const struct xattr_handler *xfs_xattr_handlers[];
extern const struct quotactl_ops xfs_quotactl_operations;
+extern const struct dax_holder_operations xfs_dax_holder_operations;
extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index d762d42ed0ff..8389f3ef88ef 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -21,6 +21,8 @@
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
/* ----- Kernel only functions below ----- */
int
@@ -33,7 +35,7 @@ xfs_readlink_bmap_ilocked(
struct xfs_buf *bp;
xfs_daddr_t d;
char *cur_chunk;
- int pathlen = ip->i_d.di_size;
+ int pathlen = ip->i_disk_size;
int nmaps = XFS_SYMLINK_MAPS;
int byte_cnt;
int n;
@@ -62,7 +64,7 @@ xfs_readlink_bmap_ilocked(
byte_cnt = pathlen;
cur_chunk = bp->b_addr;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_has_crc(mp)) {
if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
byte_cnt, bp)) {
error = -EFSCORRUPTED;
@@ -86,7 +88,7 @@ xfs_readlink_bmap_ilocked(
}
ASSERT(pathlen == 0);
- link[ip->i_d.di_size] = '\0';
+ link[ip->i_disk_size] = '\0';
error = 0;
out:
@@ -95,23 +97,21 @@ xfs_readlink_bmap_ilocked(
int
xfs_readlink(
- struct xfs_inode *ip,
- char *link)
+ struct xfs_inode *ip,
+ char *link)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fsize_t pathlen;
- int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t pathlen;
+ int error = -EFSCORRUPTED;
trace_xfs_readlink(ip);
- ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE));
-
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- pathlen = ip->i_d.di_size;
+ pathlen = ip->i_disk_size;
if (!pathlen)
goto out;
@@ -120,12 +120,22 @@ xfs_readlink(
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- error = -EFSCORRUPTED;
goto out;
}
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ /*
+ * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
+ * if if_data is junk.
+ */
+ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+ goto out;
- error = xfs_readlink_bmap_ilocked(ip, link);
+ memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+ error = 0;
+ } else {
+ error = xfs_readlink_bmap_ilocked(ip, link);
+ }
out:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -134,6 +144,7 @@ xfs_readlink(
int
xfs_symlink(
+ struct user_namespace *mnt_userns,
struct xfs_inode *dp,
struct xfs_name *link_name,
const char *target_path,
@@ -154,18 +165,19 @@ xfs_symlink(
const char *cur_chunk;
int byte_cnt;
int n;
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
struct xfs_dquot *pdqp = NULL;
uint resblks;
+ xfs_ino_t ino;
*ipp = NULL;
trace_xfs_symlink(dp, link_name);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
/*
@@ -176,15 +188,13 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
- udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp,
- xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns, &init_user_ns),
+ mapped_fsgid(mnt_userns, &init_user_ns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -194,15 +204,16 @@ xfs_symlink(
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
*/
- if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+ if (pathlen <= XFS_LITINO(mp))
fs_blocks = 0;
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
+ pdqp, resblks, &tp);
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -210,24 +221,19 @@ xfs_symlink(
/*
* Check whether the directory allows new symlinks or not.
*/
- if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+ if (dp->i_diflags & XFS_DIFLAG_NOSYMLINKS) {
error = -EPERM;
goto out_trans_cancel;
}
/*
- * Reserve disk quota : blocks and inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
- if (error)
- goto out_trans_cancel;
-
- /*
* Allocate an inode for the symlink.
*/
- error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
- prid, &ip);
+ error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
+ if (!error)
+ error = xfs_init_new_inode(mnt_userns, tp, dp, ino,
+ S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
+ false, &ip);
if (error)
goto out_trans_cancel;
@@ -246,16 +252,15 @@ xfs_symlink(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- if (resblks)
- resblks -= XFS_IALLOC_SPACE_RES(mp);
+ resblks -= XFS_IALLOC_SPACE_RES(mp);
/*
* If the symlink will fit into the inode, write it inline.
*/
- if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
- ip->i_d.di_size = pathlen;
- ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ ip->i_disk_size = pathlen;
+ ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
} else {
int offset;
@@ -268,9 +273,8 @@ xfs_symlink(
if (error)
goto out_trans_cancel;
- if (resblks)
- resblks -= fs_blocks;
- ip->i_d.di_size = pathlen;
+ resblks -= fs_blocks;
+ ip->i_disk_size = pathlen;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
cur_chunk = target_path;
@@ -305,6 +309,7 @@ xfs_symlink(
}
ASSERT(pathlen == 0);
}
+ i_size_write(VFS_I(ip), ip->i_disk_size);
/*
* Create the directory entry for the symlink.
@@ -320,9 +325,8 @@ xfs_symlink(
* symlink transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
- }
error = xfs_trans_commit(tp);
if (error)
@@ -347,7 +351,7 @@ out_release_inode:
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -370,7 +374,7 @@ STATIC int
xfs_inactive_symlink_rmt(
struct xfs_inode *ip)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
int done;
int error;
int i;
@@ -381,7 +385,7 @@ xfs_inactive_symlink_rmt(
xfs_trans_t *tp;
mp = ip->i_mount;
- ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
+ ASSERT(!xfs_need_iread_extents(&ip->i_df));
/*
* We're freeing a symlink that has some
* blocks allocated to it. Free the
@@ -389,7 +393,7 @@ xfs_inactive_symlink_rmt(
* either 1 or 2 extents and that we can
* free them all in one bunmapi call.
*/
- ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+ ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
@@ -404,8 +408,8 @@ xfs_inactive_symlink_rmt(
* locked for the second transaction. In the error paths we need it
* held so the cancel won't rele it, see below.
*/
- size = (int)ip->i_d.di_size;
- ip->i_d.di_size = 0;
+ size = (int)ip->i_disk_size;
+ ip->i_disk_size = 0;
VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
@@ -444,7 +448,7 @@ xfs_inactive_symlink_rmt(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(xfs_is_shutdown(mp));
goto error_unlock;
}
@@ -477,11 +481,11 @@ xfs_inactive_symlink(
trace_xfs_inactive_symlink(ip);
- if (XFS_FORCED_SHUTDOWN(mp))
+ if (xfs_is_shutdown(mp))
return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- pathlen = (int)ip->i_d.di_size;
+ pathlen = (int)ip->i_disk_size;
ASSERT(pathlen);
if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) {
@@ -496,7 +500,7 @@ xfs_inactive_symlink(
* Inline fork state gets removed by xfs_difree() so we have nothing to
* do here in that case.
*/
- if (ip->i_df.if_flags & XFS_IFINLINE) {
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b1fa091427e6..2586b7e393f3 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -7,8 +7,9 @@
/* Kernel only symlink definitions */
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
- const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp,
+ struct xfs_name *link_name, const char *target_path,
+ umode_t mode, struct xfs_inode **ipp);
int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_inactive_symlink(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 31b3bdbd2eba..546a6cd96729 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -13,7 +13,7 @@ STATIC int
xfs_stats_clear_proc_handler(
struct ctl_table *ctl,
int write,
- void __user *buffer,
+ void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -33,7 +33,7 @@ STATIC int
xfs_panic_mask_proc_handler(
struct ctl_table *ctl,
int write,
- void __user *buffer,
+ void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -50,13 +50,29 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
+STATIC int
+xfs_deprecated_dointvec_minmax(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) {
+ printk_ratelimited(KERN_WARNING
+ "XFS: %s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -65,7 +81,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
@@ -162,21 +178,21 @@ static struct ctl_table xfs_table[] = {
},
{
.procname = "speculative_prealloc_lifetime",
- .data = &xfs_params.eofb_timer.val,
+ .data = &xfs_params.blockgc_timer.val,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xfs_params.eofb_timer.min,
- .extra2 = &xfs_params.eofb_timer.max,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
},
{
.procname = "speculative_cow_prealloc_lifetime",
- .data = &xfs_params.cowb_timer.val,
+ .data = &xfs_params.blockgc_timer.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &xfs_params.cowb_timer.min,
- .extra2 = &xfs_params.cowb_timer.max,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
},
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 8abf4640f1d5..f78ad6b10ea5 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -35,8 +35,7 @@ typedef struct xfs_param {
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
- xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
- xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */
+ xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */
} xfs_param_t;
/*
@@ -84,6 +83,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
#ifdef DEBUG
int pwork_threads; /* parallel workqueue threads */
+ bool larp; /* log attribute replay */
#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index f1bc88f4367c..f7faf6e70d7f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sysfs.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
@@ -66,11 +67,12 @@ static const struct sysfs_ops xfs_sysfs_ops = {
static struct attribute *xfs_mp_attrs[] = {
NULL,
};
+ATTRIBUTE_GROUPS(xfs_mp);
struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_mp_attrs,
+ .default_groups = xfs_mp_groups,
};
#ifdef DEBUG
@@ -104,7 +106,7 @@ bug_on_assert_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0);
+ return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert);
}
XFS_SYSFS_ATTR_RW(bug_on_assert);
@@ -134,7 +136,7 @@ log_recovery_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay);
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
@@ -164,7 +166,7 @@ mount_delay_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+ return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay);
}
XFS_SYSFS_ATTR_RW(mount_delay);
@@ -187,7 +189,7 @@ always_cow_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+ return sysfs_emit(buf, "%d\n", xfs_globals.always_cow);
}
XFS_SYSFS_ATTR_RW(always_cow);
@@ -223,9 +225,32 @@ pwork_threads_show(
struct kobject *kobject,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+ return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+
+static ssize_t
+larp_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.larp);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+STATIC ssize_t
+larp_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
static struct attribute *xfs_dbg_attrs[] = {
@@ -235,14 +260,16 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(always_cow),
#ifdef DEBUG
ATTR_LIST(pwork_threads),
+ ATTR_LIST(larp),
#endif
NULL,
};
+ATTRIBUTE_GROUPS(xfs_dbg);
struct kobj_type xfs_dbg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_dbg_attrs,
+ .default_groups = xfs_dbg_groups,
};
#endif /* DEBUG */
@@ -295,11 +322,12 @@ static struct attribute *xfs_stats_attrs[] = {
ATTR_LIST(stats_clear),
NULL,
};
+ATTRIBUTE_GROUPS(xfs_stats);
struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_stats_attrs,
+ .default_groups = xfs_stats_groups,
};
/* xlog */
@@ -326,7 +354,7 @@ log_head_lsn_show(
block = log->l_curr_block;
spin_unlock(&log->l_icloglock);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_head_lsn);
@@ -340,7 +368,7 @@ log_tail_lsn_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
+ return sysfs_emit(buf, "%d:%d\n", cycle, block);
}
XFS_SYSFS_ATTR_RO(log_tail_lsn);
@@ -355,7 +383,7 @@ reserve_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(reserve_grant_head);
@@ -369,7 +397,7 @@ write_grant_head_show(
struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
- return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
@@ -380,11 +408,12 @@ static struct attribute *xfs_log_attrs[] = {
ATTR_LIST(write_grant_head),
NULL,
};
+ATTRIBUTE_GROUPS(xfs_log);
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_log_attrs,
+ .default_groups = xfs_log_groups,
};
/*
@@ -424,7 +453,7 @@ max_retries_show(
else
retries = cfg->max_retries;
- return snprintf(buf, PAGE_SIZE, "%d\n", retries);
+ return sysfs_emit(buf, "%d\n", retries);
}
static ssize_t
@@ -465,7 +494,7 @@ retry_timeout_seconds_show(
else
timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;
- return snprintf(buf, PAGE_SIZE, "%d\n", timeout);
+ return sysfs_emit(buf, "%d\n", timeout);
}
static ssize_t
@@ -503,7 +532,7 @@ fail_at_unmount_show(
{
struct xfs_mount *mp = err_to_mp(kobject);
- return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount);
+ return sysfs_emit(buf, "%d\n", mp->m_fail_unmount);
}
static ssize_t
@@ -533,12 +562,12 @@ static struct attribute *xfs_error_attrs[] = {
ATTR_LIST(retry_timeout_seconds),
NULL,
};
-
+ATTRIBUTE_GROUPS(xfs_error);
static struct kobj_type xfs_error_cfg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
- .default_attrs = xfs_error_attrs,
+ .default_groups = xfs_error_groups,
};
static struct kobj_type xfs_error_ktype = {
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index e9f810fc6731..513095e353a5 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -32,9 +32,16 @@ xfs_sysfs_init(
struct xfs_kobj *parent_kobj,
const char *name)
{
+ struct kobject *parent;
+ int err;
+
+ parent = parent_kobj ? &parent_kobj->kobject : NULL;
init_completion(&kobj->complete);
- return kobject_init_and_add(&kobj->kobject, ktype,
- &parent_kobj->kobject, "%s", name);
+ err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name);
+ if (err)
+ kobject_put(&kobj->kobject);
+
+ return err;
}
static inline void
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index bc85b89f88ca..d269ef57ff01 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -19,6 +20,7 @@
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
@@ -27,6 +29,11 @@
#include "xfs_log_recover.h"
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
+#include "xfs_btree_staging.h"
+#include "xfs_icache.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_error.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e242988f57fb..372d871bccc5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2,6 +2,41 @@
/*
* Copyright (c) 2009, Christoph Hellwig
* All Rights Reserved.
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time.
+ *
+ * Current conventions for printing numbers measuring specific units:
+ *
+ * agno: allocation group number
+ *
+ * agino: per-AG inode number
+ * ino: filesystem inode number
+ *
+ * agbno: per-AG block number in fs blocks
+ * startblock: physical block number for file mappings. This is either a
+ * segmented fsblock for data device mappings, or a rfsblock
+ * for realtime device mappings
+ * fsbcount: number of blocks in an extent, in fs blocks
+ *
+ * daddr: physical block number in 512b blocks
+ * bbcount: number of blocks in a physical extent, in 512b blocks
+ *
+ * owner: reverse-mapping owner, usually inodes
+ *
+ * fileoff: file offset, in fs blocks
+ * pos: file offset, in bytes
+ * bytecount: number of bytes
+ *
+ * disize: ondisk file size, in bytes
+ * isize: incore file size, in bytes
+ *
+ * forkoff: inode fork offset, in bytes
+ *
+ * ireccount: number of inode records
+ *
+ * Numbers describing space allocations (blocks, extents, inodes) should be
+ * formatted in hexadecimal.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM xfs
@@ -24,6 +59,7 @@ struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
struct xlog_rec_header;
+struct xlog_in_core;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
@@ -35,6 +71,14 @@ struct xfs_icreate_log;
struct xfs_owner_info;
struct xfs_trans_res;
struct xfs_inobt_rec_incore;
+union xfs_btree_ptr;
+struct xfs_dqtrx;
+struct xfs_icwalk;
+
+#define XFS_ATTR_FILTER_FLAGS \
+ { XFS_ATTR_ROOT, "ROOT" }, \
+ { XFS_ATTR_SECURE, "SECURE" }, \
+ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -45,39 +89,39 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
__field(u32, hashval)
__field(u32, blkno)
__field(u32, offset)
- __field(void *, alist)
+ __field(void *, buffer)
__field(int, bufsize)
__field(int, count)
__field(int, firstu)
__field(int, dupcnt)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
),
TP_fast_assign(
__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
__entry->ino = ctx->dp->i_ino;
- __entry->hashval = ctx->cursor->hashval;
- __entry->blkno = ctx->cursor->blkno;
- __entry->offset = ctx->cursor->offset;
- __entry->alist = ctx->alist;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
__entry->bufsize = ctx->bufsize;
__entry->count = ctx->count;
__entry->firstu = ctx->firstu;
- __entry->flags = ctx->flags;
+ __entry->attr_filter = ctx->attr_filter;
),
TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
- "alist %p size %u count %u firstu %u flags %d %s",
+ "buffer %p size %u count %u firstu %u filter %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->hashval,
__entry->blkno,
__entry->offset,
__entry->dupcnt,
- __entry->alist,
+ __entry->buffer,
__entry->bufsize,
__entry->count,
__entry->firstu,
- __entry->flags,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS)
)
)
@@ -96,6 +140,24 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
+TRACE_EVENT(xlog_intent_recovery_failed,
+ TP_PROTO(struct xfs_mount *mp, int error, void *function),
+ TP_ARGS(mp, error, function),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, error)
+ __field(void *, function)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->error = error;
+ __entry->function = function;
+ ),
+ TP_printk("dev %d:%d error %d function %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->error, __entry->function)
+);
+
DECLARE_EVENT_CLASS(xfs_perag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
unsigned long caller_ip),
@@ -112,7 +174,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
__entry->refcount = refcount;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u refcount %d caller %pS",
+ TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->refcount,
@@ -127,12 +189,87 @@ DEFINE_EVENT(xfs_perag_class, name, \
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+
+TRACE_EVENT(xfs_inodegc_worker,
+ TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
+ TP_ARGS(mp, shrinker_hits),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, shrinker_hits)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->shrinker_hits = shrinker_hits;
+ ),
+ TP_printk("dev %d:%d shrinker_hits %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->shrinker_hits)
+);
+
+DECLARE_EVENT_CLASS(xfs_fs_class,
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip),
+ TP_ARGS(mp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long long, mflags)
+ __field(unsigned long, opstate)
+ __field(unsigned long, sbflags)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ if (mp) {
+ __entry->dev = mp->m_super->s_dev;
+ __entry->mflags = mp->m_features;
+ __entry->opstate = mp->m_opstate;
+ __entry->sbflags = mp->m_super->s_flags;
+ }
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->mflags,
+ __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS),
+ __entry->sbflags,
+ __entry->caller_ip)
+);
+
+#define DEFINE_FS_EVENT(name) \
+DEFINE_EVENT(xfs_fs_class, name, \
+ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
+ TP_ARGS(mp, caller_ip))
+DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_push);
+DEFINE_FS_EVENT(xfs_inodegc_start);
+DEFINE_FS_EVENT(xfs_inodegc_stop);
+DEFINE_FS_EVENT(xfs_inodegc_queue);
+DEFINE_FS_EVENT(xfs_inodegc_throttle);
+DEFINE_FS_EVENT(xfs_fs_sync_fs);
+DEFINE_FS_EVENT(xfs_blockgc_start);
+DEFINE_FS_EVENT(xfs_blockgc_stop);
+DEFINE_FS_EVENT(xfs_blockgc_worker);
+DEFINE_FS_EVENT(xfs_blockgc_flush_all);
+
+TRACE_EVENT(xfs_inodegc_shrinker_scan,
+ TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc,
+ void *caller_ip),
+ TP_ARGS(mp, sc, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, nr_to_scan)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr_to_scan = sc->nr_to_scan;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d nr_to_scan %lu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_to_scan,
+ __entry->caller_ip)
+);
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -145,7 +282,7 @@ DECLARE_EVENT_CLASS(xfs_ag_class,
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
),
- TP_printk("dev %d:%d agno %u",
+ TP_printk("dev %d:%d agno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno)
);
@@ -169,31 +306,31 @@ TRACE_EVENT(xfs_attr_list_node_descend,
__field(u32, hashval)
__field(u32, blkno)
__field(u32, offset)
- __field(void *, alist)
+ __field(void *, buffer)
__field(int, bufsize)
__field(int, count)
__field(int, firstu)
__field(int, dupcnt)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
__field(u32, bt_hashval)
__field(u32, bt_before)
),
TP_fast_assign(
__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
__entry->ino = ctx->dp->i_ino;
- __entry->hashval = ctx->cursor->hashval;
- __entry->blkno = ctx->cursor->blkno;
- __entry->offset = ctx->cursor->offset;
- __entry->alist = ctx->alist;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
__entry->bufsize = ctx->bufsize;
__entry->count = ctx->count;
__entry->firstu = ctx->firstu;
- __entry->flags = ctx->flags;
+ __entry->attr_filter = ctx->attr_filter;
__entry->bt_hashval = be32_to_cpu(btree->hashval);
__entry->bt_before = be32_to_cpu(btree->before);
),
TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
- "alist %p size %u count %u firstu %u flags %d %s "
+ "buffer %p size %u count %u firstu %u filter %s "
"node hashval %u, node before %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -201,12 +338,12 @@ TRACE_EVENT(xfs_attr_list_node_descend,
__entry->blkno,
__entry->offset,
__entry->dupcnt,
- __entry->alist,
+ __entry->buffer,
__entry->bufsize,
__entry->count,
__entry->firstu,
- __entry->flags,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
__entry->bt_hashval,
__entry->bt_before)
);
@@ -245,7 +382,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d "
- "offset %lld block %lld count %lld flag %d caller %pS",
+ "fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -282,22 +419,21 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__field(unsigned, lockval)
__field(unsigned, flags)
__field(unsigned long, caller_ip)
+ __field(const void *, buf_ops)
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- if (bp->b_bn == XFS_BUF_DADDR_NULL)
- __entry->bno = bp->b_maps[0].bm_bn;
- else
- __entry->bno = bp->b_bn;
+ __entry->bno = xfs_buf_daddr(bp);
__entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
+ __entry->buf_ops = bp->b_ops;
),
- TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
- "lock %d flags %s caller %pS",
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
+ "lock %d flags %s bufops %pS caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->nblks,
@@ -305,6 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->pincount,
__entry->lockval,
__print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ __entry->buf_ops,
(void *)__entry->caller_ip)
)
@@ -331,9 +468,9 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
-DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
+DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
/* not really buffer traces, but the buf provides useful information */
@@ -347,7 +484,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, bno)
- __field(size_t, buffer_length)
+ __field(unsigned int, length)
__field(int, hold)
__field(int, pincount)
__field(unsigned, lockval)
@@ -356,19 +493,19 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- __entry->bno = bp->b_bn;
- __entry->buffer_length = BBTOB(bp->b_length);
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->length = bp->b_length;
__entry->flags = flags;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
- __entry->buffer_length,
+ __entry->length,
__entry->hold,
__entry->pincount,
__entry->lockval,
@@ -390,7 +527,7 @@ TRACE_EVENT(xfs_buf_ioerror,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, bno)
- __field(size_t, buffer_length)
+ __field(unsigned int, length)
__field(unsigned, flags)
__field(int, hold)
__field(int, pincount)
@@ -400,8 +537,8 @@ TRACE_EVENT(xfs_buf_ioerror,
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- __entry->bno = bp->b_bn;
- __entry->buffer_length = BBTOB(bp->b_length);
+ __entry->bno = xfs_buf_daddr(bp);
+ __entry->length = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
__entry->lockval = bp->b_sema.count;
@@ -409,11 +546,11 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d error %d flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
- __entry->buffer_length,
+ __entry->length,
__entry->hold,
__entry->pincount,
__entry->lockval,
@@ -428,7 +565,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_daddr_t, buf_bno)
- __field(size_t, buf_len)
+ __field(unsigned int, buf_len)
__field(int, buf_hold)
__field(int, buf_pincount)
__field(int, buf_lockval)
@@ -443,15 +580,15 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->bli_flags = bip->bli_flags;
__entry->bli_recur = bip->bli_recur;
__entry->bli_refcount = atomic_read(&bip->bli_refcount);
- __entry->buf_bno = bip->bli_buf->b_bn;
- __entry->buf_len = BBTOB(bip->bli_buf->b_length);
+ __entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
+ __entry->buf_len = bip->bli_buf->b_length;
__entry->buf_flags = bip->bli_buf->b_flags;
__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
__entry->li_flags = bip->bli_item.li_flags;
),
- TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
"lock %d flags %s recur %d refcount %d bliflags %s "
"liflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -511,7 +648,7 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
__entry->agno = agno;
__entry->streams = xfs_filestream_peek_ag(mp, agno);
),
- TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->agno,
@@ -545,7 +682,7 @@ TRACE_EVENT(xfs_filestream_pick,
__entry->free = free;
__entry->nscan = nscan;
),
- TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->agno,
@@ -593,14 +730,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
+ __field(unsigned long, iflags)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
+ __entry->iflags = ip->i_flags;
),
- TP_printk("dev %d:%d ino 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino)
+ __entry->ino,
+ __entry->iflags)
)
#define DEFINE_INODE_EVENT(name) \
@@ -608,8 +748,8 @@ DEFINE_EVENT(xfs_inode_class, name, \
TP_PROTO(struct xfs_inode *ip), \
TP_ARGS(ip))
DEFINE_INODE_EVENT(xfs_iget_skip);
-DEFINE_INODE_EVENT(xfs_iget_reclaim);
-DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_recycle);
+DEFINE_INODE_EVENT(xfs_iget_recycle_fail);
DEFINE_INODE_EVENT(xfs_iget_hit);
DEFINE_INODE_EVENT(xfs_iget_miss);
@@ -644,6 +784,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_reclaimable);
+DEFINE_INODE_EVENT(xfs_inode_reclaiming);
+DEFINE_INODE_EVENT(xfs_inode_set_need_inactive);
+DEFINE_INODE_EVENT(xfs_inode_inactivating);
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -655,6 +799,9 @@ TRACE_DEFINE_ENUM(PE_SIZE_PTE);
TRACE_DEFINE_ENUM(PE_SIZE_PMD);
TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
@@ -750,9 +897,12 @@ TRACE_EVENT(xfs_irec_merge_pre,
__entry->nagino = nagino;
__entry->nholemask = holemask;
),
- TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
- __entry->agino, __entry->holemask, __entry->nagino,
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->holemask,
+ __entry->nagino,
__entry->nholemask)
)
@@ -772,8 +922,11 @@ TRACE_EVENT(xfs_irec_merge_post,
__entry->agino = agino;
__entry->holemask = holemask;
),
- TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
- MINOR(__entry->dev), __entry->agno, __entry->agino,
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x",
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
__entry->holemask)
)
@@ -787,7 +940,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
DECLARE_EVENT_CLASS(xfs_namespace_class,
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
TP_ARGS(dp, name),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -810,7 +963,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
#define DEFINE_NAMESPACE_EVENT(name) \
DEFINE_EVENT(xfs_namespace_class, name, \
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \
TP_ARGS(dp, name))
DEFINE_NAMESPACE_EVENT(xfs_remove);
DEFINE_NAMESPACE_EVENT(xfs_link);
@@ -858,44 +1011,65 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u32, id)
+ __field(xfs_dqtype_t, type)
__field(unsigned, flags)
__field(unsigned, nrefs)
__field(unsigned long long, res_bcount)
+ __field(unsigned long long, res_rtbcount)
+ __field(unsigned long long, res_icount)
+
__field(unsigned long long, bcount)
+ __field(unsigned long long, rtbcount)
__field(unsigned long long, icount)
+
__field(unsigned long long, blk_hardlimit)
__field(unsigned long long, blk_softlimit)
+ __field(unsigned long long, rtb_hardlimit)
+ __field(unsigned long long, rtb_softlimit)
__field(unsigned long long, ino_hardlimit)
__field(unsigned long long, ino_softlimit)
- ), \
+ ),
TP_fast_assign(
__entry->dev = dqp->q_mount->m_super->s_dev;
- __entry->id = be32_to_cpu(dqp->q_core.d_id);
- __entry->flags = dqp->dq_flags;
+ __entry->id = dqp->q_id;
+ __entry->type = dqp->q_type;
+ __entry->flags = dqp->q_flags;
__entry->nrefs = dqp->q_nrefs;
- __entry->res_bcount = dqp->q_res_bcount;
- __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
- __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
- __entry->blk_hardlimit =
- be64_to_cpu(dqp->q_core.d_blk_hardlimit);
- __entry->blk_softlimit =
- be64_to_cpu(dqp->q_core.d_blk_softlimit);
- __entry->ino_hardlimit =
- be64_to_cpu(dqp->q_core.d_ino_hardlimit);
- __entry->ino_softlimit =
- be64_to_cpu(dqp->q_core.d_ino_softlimit);
- ),
- TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+
+ __entry->res_bcount = dqp->q_blk.reserved;
+ __entry->res_rtbcount = dqp->q_rtb.reserved;
+ __entry->res_icount = dqp->q_ino.reserved;
+
+ __entry->bcount = dqp->q_blk.count;
+ __entry->rtbcount = dqp->q_rtb.count;
+ __entry->icount = dqp->q_ino.count;
+
+ __entry->blk_hardlimit = dqp->q_blk.hardlimit;
+ __entry->blk_softlimit = dqp->q_blk.softlimit;
+ __entry->rtb_hardlimit = dqp->q_rtb.hardlimit;
+ __entry->rtb_softlimit = dqp->q_rtb.softlimit;
+ __entry->ino_hardlimit = dqp->q_ino.hardlimit;
+ __entry->ino_softlimit = dqp->q_ino.softlimit;
+ ),
+ TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u "
+ "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx "
"bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+ "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx "
"icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->id,
- __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
__entry->nrefs,
__entry->res_bcount,
+ __entry->res_rtbcount,
+ __entry->res_icount,
__entry->bcount,
__entry->blk_hardlimit,
__entry->blk_softlimit,
+ __entry->rtbcount,
+ __entry->rtb_hardlimit,
+ __entry->rtb_softlimit,
__entry->icount,
__entry->ino_hardlimit,
__entry->ino_softlimit)
@@ -926,6 +1100,109 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
DEFINE_DQUOT_EVENT(xfs_dqflush);
DEFINE_DQUOT_EVENT(xfs_dqflush_force);
DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
+
+TRACE_EVENT(xfs_trans_mod_dquot,
+ TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
+ unsigned int field, int64_t delta),
+ TP_ARGS(tp, dqp, field, delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned int, flags)
+ __field(unsigned int, dqid)
+ __field(unsigned int, field)
+ __field(int64_t, delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->type = dqp->q_type;
+ __entry->flags = dqp->q_flags;
+ __entry->dqid = dqp->q_id;
+ __entry->field = field;
+ __entry->delta = delta;
+ ),
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dqid,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+ __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS),
+ __entry->delta)
+);
+
+DECLARE_EVENT_CLASS(xfs_dqtrx_class,
+ TP_PROTO(struct xfs_dqtrx *qtrx),
+ TP_ARGS(qtrx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned int, flags)
+ __field(u32, dqid)
+
+ __field(uint64_t, blk_res)
+ __field(int64_t, bcount_delta)
+ __field(int64_t, delbcnt_delta)
+
+ __field(uint64_t, rtblk_res)
+ __field(uint64_t, rtblk_res_used)
+ __field(int64_t, rtbcount_delta)
+ __field(int64_t, delrtb_delta)
+
+ __field(uint64_t, ino_res)
+ __field(uint64_t, ino_res_used)
+ __field(int64_t, icount_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev;
+ __entry->type = qtrx->qt_dquot->q_type;
+ __entry->flags = qtrx->qt_dquot->q_flags;
+ __entry->dqid = qtrx->qt_dquot->q_id;
+
+ __entry->blk_res = qtrx->qt_blk_res;
+ __entry->bcount_delta = qtrx->qt_bcount_delta;
+ __entry->delbcnt_delta = qtrx->qt_delbcnt_delta;
+
+ __entry->rtblk_res = qtrx->qt_rtblk_res;
+ __entry->rtblk_res_used = qtrx->qt_rtblk_res_used;
+ __entry->rtbcount_delta = qtrx->qt_rtbcount_delta;
+ __entry->delrtb_delta = qtrx->qt_delrtb_delta;
+
+ __entry->ino_res = qtrx->qt_ino_res;
+ __entry->ino_res_used = qtrx->qt_ino_res_used;
+ __entry->icount_delta = qtrx->qt_icount_delta;
+ ),
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s "
+ "blk_res %llu bcount_delta %lld delbcnt_delta %lld "
+ "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
+ "ino_res %llu ino_res_used %llu icount_delta %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dqid,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+
+ __entry->blk_res,
+ __entry->bcount_delta,
+ __entry->delbcnt_delta,
+
+ __entry->rtblk_res,
+ __entry->rtblk_res_used,
+ __entry->rtbcount_delta,
+ __entry->delrtb_delta,
+
+ __entry->ino_res,
+ __entry->ino_res_used,
+ __entry->icount_delta)
+)
+
+#define DEFINE_DQTRX_EVENT(name) \
+DEFINE_EVENT(xfs_dqtrx_class, name, \
+ TP_PROTO(struct xfs_dqtrx *qtrx), \
+ TP_ARGS(qtrx))
+DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after);
DECLARE_EVENT_CLASS(xfs_loggrant_class,
TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
@@ -995,8 +1272,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
@@ -1005,12 +1280,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1023,7 +1299,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
@@ -1063,6 +1339,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1076,7 +1355,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
__field(xfs_lsn_t, new_lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
@@ -1123,8 +1402,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
- TP_ARGS(ip, count, offset),
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
+ TP_ARGS(iocb, iter),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -1133,13 +1412,13 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(size_t, count)
),
TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
- __entry->offset = offset;
- __entry->count = count;
+ __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+ __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
+ __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_disk_size;
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1149,14 +1428,16 @@ DECLARE_EVENT_CLASS(xfs_file_class,
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
- TP_ARGS(ip, count, offset))
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \
+ TP_ARGS(iocb, iter))
DEFINE_RW_EVENT(xfs_file_buffered_read);
DEFINE_RW_EVENT(xfs_file_direct_read);
DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
+DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
@@ -1176,7 +1457,7 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
+ __entry->size = ip->i_disk_size;
__entry->offset = offset;
__entry->count = count;
__entry->whichfork = whichfork;
@@ -1184,14 +1465,14 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx "
+ "fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
@@ -1222,12 +1503,12 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->isize = VFS_I(ip)->i_size;
- __entry->disize = ip->i_d.di_size;
+ __entry->disize = ip->i_disk_size;
__entry->offset = offset;
__entry->count = count;
),
TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
- "offset 0x%llx count %zd",
+ "pos 0x%llx bytecount 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
@@ -1260,10 +1541,10 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
+ __entry->size = ip->i_disk_size;
__entry->new_size = new_size;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1290,11 +1571,11 @@ TRACE_EVENT(xfs_pagecache_inval,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
+ __entry->size = ip->i_disk_size;
__entry->start = start;
__entry->finish = finish;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1303,14 +1584,14 @@ TRACE_EVENT(xfs_pagecache_inval,
);
TRACE_EVENT(xfs_bunmap,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len,
int flags, unsigned long caller_ip),
- TP_ARGS(ip, bno, len, flags, caller_ip),
+ TP_ARGS(ip, fileoff, len, flags, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
- __field(xfs_fileoff_t, bno)
+ __field(xfs_fileoff_t, fileoff)
__field(xfs_filblks_t, len)
__field(unsigned long, caller_ip)
__field(int, flags)
@@ -1318,18 +1599,18 @@ TRACE_EVENT(xfs_bunmap,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
- __entry->bno = bno;
+ __entry->size = ip->i_disk_size;
+ __entry->fileoff = fileoff;
__entry->len = len;
__entry->caller_ip = caller_ip;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx "
"flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->bno,
+ __entry->fileoff,
__entry->len,
__print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
(void *)__entry->caller_ip)
@@ -1352,7 +1633,7 @@ DECLARE_EVENT_CLASS(xfs_extent_busy_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1390,7 +1671,7 @@ TRACE_EVENT(xfs_extent_busy_trim,
__entry->tbno = tbno;
__entry->tlen = tlen;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1437,7 +1718,7 @@ DECLARE_EVENT_CLASS(xfs_agf_class,
__entry->longest = be32_to_cpu(agf->agf_longest);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
+ TP_printk("dev %d:%d agno 0x%x flags %s length %u roots b %u c %u "
"levels b %u c %u flfirst %u fllast %u flcount %u "
"freeblks %u longest %u caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1486,7 +1767,7 @@ TRACE_EVENT(xfs_free_extent,
__entry->haveleft = haveleft;
__entry->haveright = haveright;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1543,7 +1824,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->datatype = args->datatype;
__entry->firstblock = args->tp->t_firstblock;
),
- TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u "
"prod %u minleft %u total %u alignment %u minalignslop %u "
"len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
"datatype 0x%x firstblock 0x%llx",
@@ -1621,7 +1902,7 @@ TRACE_EVENT(xfs_alloc_cur_check,
__entry->diff = diff;
__entry->new = new;
),
- TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d",
+ TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->bno, __entry->len, __entry->diff, __entry->new)
@@ -1637,7 +1918,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(int, namelen)
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1701,8 +1982,9 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(int, namelen)
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
- __field(int, flags)
- __field(int, op_flags)
+ __field(unsigned int, attr_filter)
+ __field(unsigned int, attr_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1712,11 +1994,12 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen = args->namelen;
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
- __entry->flags = args->flags;
+ __entry->attr_filter = args->attr_filter;
+ __entry->attr_flags = args->attr_flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x flags %s op_flags %s",
+ "hashval 0x%x filter %s flags %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1724,7 +2007,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen,
__entry->valuelen,
__entry->hashval,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
+ __print_flags(__entry->attr_flags, "|",
+ { XATTR_CREATE, "CREATE" },
+ { XATTR_REPLACE, "REPLACE" }),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -1743,7 +2030,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
-DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
@@ -1773,7 +2059,6 @@ DEFINE_ATTR_EVENT(xfs_attr_refillstate);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
-DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
#define DEFINE_DA_EVENT(name) \
DEFINE_EVENT(xfs_da_class, name, \
@@ -1806,7 +2091,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, idx)
),
TP_fast_assign(
@@ -1837,7 +2122,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, src_idx)
__field(int, dst_idx)
__field(int, count)
@@ -1878,7 +2163,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(int, which)
__field(xfs_ino_t, ino)
__field(int, format)
- __field(int, nex)
+ __field(xfs_extnum_t, nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -1886,13 +2171,13 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->which = which;
__entry->ino = ip->i_ino;
- __entry->format = ip->i_d.di_format;
- __entry->nex = ip->i_d.di_nextents;
+ __entry->format = ip->i_df.if_format;
+ __entry->nex = ip->i_df.if_nextents;
__entry->broot_size = ip->i_df.if_broot_bytes;
- __entry->fork_off = XFS_IFORK_BOFF(ip);
+ __entry->fork_off = xfs_inode_fork_boff(ip);
),
- TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
- "broot size %d, fork offset %d",
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
+ "broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
@@ -2018,7 +2303,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
__entry->size = buf_f->blf_size;
__entry->map_size = buf_f->blf_map_size;
),
- TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+ TP_printk("dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, "
"map_size %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->blkno,
@@ -2069,7 +2354,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
__entry->boffset = in_f->ilf_boffset;
),
TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
- "dsize %d, blkno 0x%llx, len %d, boffset %d",
+ "dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -2110,10 +2395,14 @@ DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
__entry->length = be32_to_cpu(in_f->icl_length);
__entry->gen = be32_to_cpu(in_f->icl_gen);
),
- TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
- "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno, __entry->agbno, __entry->count, __entry->isize,
- __entry->length, __entry->gen)
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->length,
+ __entry->count,
+ __entry->isize,
+ __entry->gen)
)
#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
@@ -2139,7 +2428,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2181,8 +2470,8 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
__entry->btnum = cur->bc_btnum;
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
- __entry->ptr = cur->bc_ptrs[level];
- __entry->daddr = bp ? bp->b_bn : -1;
+ __entry->ptr = cur->bc_levels[level].ptr;
+ __entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
),
TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -2298,7 +2587,7 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d op %d agno %u agbno %u len %u",
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->agno,
@@ -2345,13 +2634,13 @@ DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
__entry->l_state = state;
__entry->op = op;
),
- TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d",
+ TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->op,
__entry->agno,
__entry->agbno,
__entry->ino,
- __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data",
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->l_loff,
__entry->l_len,
__entry->l_state)
@@ -2381,6 +2670,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
@@ -2414,7 +2704,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
if (unwritten)
__entry->flags |= XFS_RMAP_UNWRITTEN;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2447,7 +2737,7 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class,
__entry->error = error;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u error %d caller %pS",
+ TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->error,
@@ -2494,7 +2784,7 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
__entry->offset = offset;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2563,7 +2853,7 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
__entry->asked = r ? r->ar_asked : 0;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u "
+ TP_printk("dev %d:%d agno 0x%x resv %d freeblks %u flcount %u "
"resv %u ask %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
@@ -2616,7 +2906,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__entry->agbno = agbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2638,6 +2928,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
@@ -2645,13 +2936,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
@@ -2671,6 +2964,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
__field(xfs_nlink_t, refcount)
@@ -2679,14 +2973,16 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount,
@@ -2707,9 +3003,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
@@ -2717,20 +3015,24 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount)
@@ -2751,9 +3053,11 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
@@ -2762,21 +3066,25 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u @ agbno %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
@@ -2799,12 +3107,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, i1_domain)
__field(xfs_agblock_t, i1_startblock)
__field(xfs_extlen_t, i1_blockcount)
__field(xfs_nlink_t, i1_refcount)
+ __field(enum xfs_refc_domain, i2_domain)
__field(xfs_agblock_t, i2_startblock)
__field(xfs_extlen_t, i2_blockcount)
__field(xfs_nlink_t, i2_refcount)
+ __field(enum xfs_refc_domain, i3_domain)
__field(xfs_agblock_t, i3_startblock)
__field(xfs_extlen_t, i3_blockcount)
__field(xfs_nlink_t, i3_refcount)
@@ -2812,27 +3123,33 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
+ __entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
__entry->i1_refcount = i1->rc_refcount;
+ __entry->i2_domain = i2->rc_domain;
__entry->i2_startblock = i2->rc_startblock;
__entry->i2_blockcount = i2->rc_blockcount;
__entry->i2_refcount = i2->rc_refcount;
+ __entry->i3_domain = i3->rc_domain;
__entry->i3_startblock = i3->rc_startblock;
__entry->i3_blockcount = i3->rc_blockcount;
__entry->i3_refcount = i3->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- "
+ "dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i1_startblock,
__entry->i1_blockcount,
__entry->i1_refcount,
+ __print_symbolic(__entry->i2_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i2_startblock,
__entry->i2_blockcount,
__entry->i2_refcount,
+ __print_symbolic(__entry->i3_domain, XFS_REFC_DOMAIN_STRINGS),
__entry->i3_startblock,
__entry->i3_blockcount,
__entry->i3_refcount)
@@ -2911,7 +3228,7 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover,
__entry->new_agbno = new_agbno;
__entry->new_len = new_len;
),
- TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u",
+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->agno,
@@ -2937,7 +3254,7 @@ DECLARE_EVENT_CLASS(xfs_inode_error_class,
__entry->error = error;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino %llx error %d caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx error %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->error,
@@ -2963,7 +3280,7 @@ DECLARE_EVENT_CLASS(xfs_double_io_class,
__field(loff_t, src_isize)
__field(loff_t, src_disize)
__field(loff_t, src_offset)
- __field(size_t, len)
+ __field(long long, len)
__field(xfs_ino_t, dest_ino)
__field(loff_t, dest_isize)
__field(loff_t, dest_disize)
@@ -2973,17 +3290,17 @@ DECLARE_EVENT_CLASS(xfs_double_io_class,
__entry->dev = VFS_I(src)->i_sb->s_dev;
__entry->src_ino = src->i_ino;
__entry->src_isize = VFS_I(src)->i_size;
- __entry->src_disize = src->i_d.di_size;
+ __entry->src_disize = src->i_disk_size;
__entry->src_offset = soffset;
__entry->len = len;
__entry->dest_ino = dest->i_ino;
__entry->dest_isize = VFS_I(dest)->i_size;
- __entry->dest_disize = dest->i_d.di_size;
+ __entry->dest_disize = dest->i_disk_size;
__entry->dest_offset = doffset;
),
- TP_printk("dev %d:%d count %zd "
- "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> "
- "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx",
+ TP_printk("dev %d:%d bytecount 0x%llx "
+ "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> "
+ "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->len,
__entry->src_ino,
@@ -3022,7 +3339,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
__entry->pblk = irec->br_startblock;
__entry->state = irec->br_state;
),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
+ TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->lblk,
@@ -3041,8 +3358,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
-TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+TRACE_EVENT(xfs_reflink_remap_blocks,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
xfs_fileoff_t doffset),
@@ -3063,9 +3379,8 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop,
__entry->dest_ino = dest->i_ino;
__entry->dest_lblk = doffset;
),
- TP_printk("dev %d:%d len 0x%llx "
- "ino 0x%llx offset 0x%llx blocks -> "
- "ino 0x%llx offset 0x%llx blocks",
+ TP_printk("dev %d:%d fsbcount 0x%llx "
+ "ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->len,
__entry->src_ino,
@@ -3073,59 +3388,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop,
__entry->dest_ino,
__entry->dest_lblk)
);
-TRACE_EVENT(xfs_reflink_punch_range,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
- xfs_extlen_t len),
- TP_ARGS(ip, lblk, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->len)
-);
-TRACE_EVENT(xfs_reflink_remap,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
- xfs_extlen_t len, xfs_fsblock_t new_pblk),
- TP_ARGS(ip, lblk, len, new_pblk),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_extlen_t, len)
- __field(xfs_fsblock_t, new_pblk)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->len = len;
- __entry->new_pblk = new_pblk;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->len,
- __entry->new_pblk)
-);
DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
/* dedupe tracepoints */
DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
@@ -3149,9 +3419,7 @@ TRACE_EVENT(xfs_ioctl_clone,
__entry->dest_ino = dest->i_ino;
__entry->dest_isize = i_size_read(dest);
),
- TP_printk("dev %d:%d "
- "ino 0x%lx isize 0x%llx -> "
- "ino 0x%lx isize 0x%llx",
+ TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_ino,
__entry->src_isize,
@@ -3169,11 +3437,10 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -3189,7 +3456,7 @@ DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
/* fsmap traces */
DECLARE_EVENT_CLASS(xfs_fsmap_class,
TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
- struct xfs_rmap_irec *rmap),
+ const struct xfs_rmap_irec *rmap),
TP_ARGS(mp, keydev, agno, rmap),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -3211,7 +3478,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
__entry->offset = rmap->rm_offset;
__entry->flags = rmap->rm_flags;
),
- TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x",
+ TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->agno,
@@ -3224,7 +3491,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
#define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT(xfs_fsmap_class, name, \
TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
- struct xfs_rmap_irec *rmap), \
+ const struct xfs_rmap_irec *rmap), \
TP_ARGS(mp, keydev, agno, rmap))
DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
@@ -3251,7 +3518,7 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class,
__entry->offset = fsmap->fmr_offset;
__entry->flags = fsmap->fmr_flags;
),
- TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx",
+ TP_printk("dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->block,
@@ -3268,7 +3535,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
-TRACE_EVENT(xfs_trans_resv_calc,
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
TP_PROTO(struct xfs_mount *mp, unsigned int type,
struct xfs_trans_res *res),
TP_ARGS(mp, type, res),
@@ -3292,6 +3559,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logres,
__entry->logcount,
__entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+ struct xfs_trans_res *res), \
+ TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+ TP_ARGS(mp, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint, logres)
+ __field(int, logcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ ),
+ TP_printk("dev %d:%d logres %u logcount %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logres,
+ __entry->logcount)
);
DECLARE_EVENT_CLASS(xfs_trans_class,
@@ -3350,7 +3644,7 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
__entry->old_ptr = old_ptr;
__entry->new_ptr = new_ptr;
),
- TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ TP_printk("dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->bucket,
@@ -3376,7 +3670,7 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__entry->old_ptr = old_ptr;
__entry->new_ptr = new_ptr;
),
- TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agino,
@@ -3397,7 +3691,7 @@ DECLARE_EVENT_CLASS(xfs_ag_inode_class,
__entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
__entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
),
- TP_printk("dev %d:%d agno %u agino %u",
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno, __entry->agino)
)
@@ -3408,7 +3702,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
TP_ARGS(ip))
DEFINE_AGINODE_EVENT(xfs_iunlink);
DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
-DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
TP_PROTO(struct xfs_mount *mp, unsigned int flags),
@@ -3449,7 +3742,7 @@ DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
__entry->agno = agno;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno %u flags 0x%x",
+ TP_printk("dev %d:%d agno 0x%x flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno, __entry->flags)
);
@@ -3500,7 +3793,7 @@ TRACE_EVENT(xfs_iwalk_ag,
__entry->agno = agno;
__entry->startino = startino;
),
- TP_printk("dev %d:%d agno %d startino %u",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
__entry->startino)
)
@@ -3521,7 +3814,7 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
__entry->startino = irec->ir_startino;
__entry->freemask = irec->ir_free;
),
- TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
__entry->startino, __entry->freemask)
)
@@ -3568,10 +3861,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \
TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
TP_ARGS(size, flags, caller_ip))
DEFINE_KMEM_EVENT(kmem_alloc);
-DEFINE_KMEM_EVENT(kmem_alloc_io);
-DEFINE_KMEM_EVENT(kmem_alloc_large);
-DEFINE_KMEM_EVENT(kmem_realloc);
-DEFINE_KMEM_EVENT(kmem_zone_alloc);
TRACE_EVENT(xfs_check_new_dalign,
TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
@@ -3588,12 +3877,367 @@ TRACE_EVENT(xfs_check_new_dalign,
__entry->sb_rootino = mp->m_sb.sb_rootino;
__entry->calc_rootino = calc_rootino;
),
- TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu",
+ TP_printk("dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->new_dalign, __entry->sb_rootino,
__entry->calc_rootino)
)
+TRACE_EVENT(xfs_btree_commit_afakeroot,
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agbno = cur->bc_ag.afake->af_root;
+ __entry->levels = cur->bc_ag.afake->af_levels;
+ __entry->blocks = cur->bc_ag.afake->af_blocks;
+ ),
+ TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->levels,
+ __entry->blocks,
+ __entry->agbno)
+)
+
+TRACE_EVENT(xfs_btree_commit_ifakeroot,
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ __field(int, whichfork)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->levels = cur->bc_ino.ifake->if_levels;
+ __entry->blocks = cur->bc_ino.ifake->if_blocks;
+ __entry->whichfork = cur->bc_ino.whichfork;
+ ),
+ TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->agino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->levels,
+ __entry->blocks)
+)
+
+TRACE_EVENT(xfs_btree_bload_level_geometry,
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t nr_this_level, unsigned int nr_per_block,
+ unsigned int desired_npb, uint64_t blocks,
+ uint64_t blocks_with_extra),
+ TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks,
+ blocks_with_extra),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned int, nlevels)
+ __field(uint64_t, nr_this_level)
+ __field(unsigned int, nr_per_block)
+ __field(unsigned int, desired_npb)
+ __field(unsigned long long, blocks)
+ __field(unsigned long long, blocks_with_extra)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels;
+ __entry->nr_this_level = nr_this_level;
+ __entry->nr_per_block = nr_per_block;
+ __entry->desired_npb = desired_npb;
+ __entry->blocks = blocks;
+ __entry->blocks_with_extra = blocks_with_extra;
+ ),
+ TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->nlevels,
+ __entry->nr_this_level,
+ __entry->nr_per_block,
+ __entry->desired_npb,
+ __entry->blocks,
+ __entry->blocks_with_extra)
+)
+
+TRACE_EVENT(xfs_btree_bload_block,
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t block_idx, uint64_t nr_blocks,
+ union xfs_btree_ptr *ptr, unsigned int nr_records),
+ TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned long long, block_idx)
+ __field(unsigned long long, nr_blocks)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, nr_records)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->block_idx = block_idx;
+ __entry->nr_blocks = nr_blocks;
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
+ } else {
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ __entry->nr_records = nr_records;
+ ),
+ TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->block_idx,
+ __entry->nr_blocks,
+ __entry->agno,
+ __entry->agbno,
+ __entry->nr_records)
+)
+
+DECLARE_EVENT_CLASS(xfs_timestamp_range_class,
+ TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max),
+ TP_ARGS(mp, min, max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(long long, min)
+ __field(long long, max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->min = min;
+ __entry->max = max;
+ ),
+ TP_printk("dev %d:%d min %lld max %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->min,
+ __entry->max)
+)
+
+#define DEFINE_TIMESTAMP_RANGE_EVENT(name) \
+DEFINE_EVENT(xfs_timestamp_range_class, name, \
+ TP_PROTO(struct xfs_mount *mp, long long min, long long max), \
+ TP_ARGS(mp, min, max))
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+
+DECLARE_EVENT_CLASS(xfs_icwalk_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw,
+ unsigned long caller_ip),
+ TP_ARGS(mp, icw, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(__u32, flags)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(prid_t, prid)
+ __field(__u64, min_file_size)
+ __field(long, scan_limit)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = icw ? icw->icw_flags : 0;
+ __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns,
+ icw->icw_uid) : 0;
+ __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns,
+ icw->icw_gid) : 0;
+ __entry->prid = icw ? icw->icw_prid : 0;
+ __entry->min_file_size = icw ? icw->icw_min_file_size : 0;
+ __entry->scan_limit = icw ? icw->icw_scan_limit : 0;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->uid,
+ __entry->gid,
+ __entry->prid,
+ __entry->min_file_size,
+ __entry->scan_limit,
+ (char *)__entry->caller_ip)
+);
+#define DEFINE_ICWALK_EVENT(name) \
+DEFINE_EVENT(xfs_icwalk_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, icw, caller_ip))
+DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks);
+DEFINE_ICWALK_EVENT(xfs_blockgc_free_space);
+
+TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE);
+TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC);
+TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING);
+TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC);
+TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK);
+TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY);
+
+DECLARE_EVENT_CLASS(xlog_iclog_class,
+ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip),
+ TP_ARGS(iclog, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint32_t, state)
+ __field(int32_t, refcount)
+ __field(uint32_t, offset)
+ __field(uint32_t, flags)
+ __field(unsigned long long, lsn)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = iclog->ic_log->l_mp->m_super->s_dev;
+ __entry->state = iclog->ic_state;
+ __entry->refcount = atomic_read(&iclog->ic_refcnt);
+ __entry->offset = iclog->ic_offset;
+ __entry->flags = iclog->ic_flags;
+ __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->state, XLOG_STATE_STRINGS),
+ __entry->refcount,
+ __entry->offset,
+ __entry->lsn,
+ __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS),
+ (char *)__entry->caller_ip)
+
+);
+
+#define DEFINE_ICLOG_EVENT(name) \
+DEFINE_EVENT(xlog_iclog_class, name, \
+ TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \
+ TP_ARGS(iclog, caller_ip))
+
+DEFINE_ICLOG_EVENT(xlog_iclog_activate);
+DEFINE_ICLOG_EVENT(xlog_iclog_clean);
+DEFINE_ICLOG_EVENT(xlog_iclog_callback);
+DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start);
+DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done);
+DEFINE_ICLOG_EVENT(xlog_iclog_force);
+DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn);
+DEFINE_ICLOG_EVENT(xlog_iclog_get_space);
+DEFINE_ICLOG_EVENT(xlog_iclog_release);
+DEFINE_ICLOG_EVENT(xlog_iclog_switch);
+DEFINE_ICLOG_EVENT(xlog_iclog_sync);
+DEFINE_ICLOG_EVENT(xlog_iclog_syncing);
+DEFINE_ICLOG_EVENT(xlog_iclog_sync_done);
+DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
+DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
+DEFINE_ICLOG_EVENT(xlog_iclog_write);
+
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
+DECLARE_EVENT_CLASS(xfs_das_state_class,
+ TP_PROTO(int das, struct xfs_inode *ip),
+ TP_ARGS(das, ip),
+ TP_STRUCT__entry(
+ __field(int, das)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->das = das;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("state change %s ino 0x%llx",
+ __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+ __entry->ino)
+)
+
+#define DEFINE_DAS_STATE_EVENT(name) \
+DEFINE_EVENT(xfs_das_state_class, name, \
+ TP_PROTO(int das, struct xfs_inode *ip), \
+ TP_ARGS(das, ip))
+DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
+
+TRACE_EVENT(xfs_force_shutdown,
+ TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
+ int line_num),
+ TP_ARGS(mp, ptag, flags, fname, line_num),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, ptag)
+ __field(int, flags)
+ __string(fname, fname)
+ __field(int, line_num)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ptag = ptag;
+ __entry->flags = flags;
+ __assign_str(fname, fname);
+ __entry->line_num = line_num;
+ ),
+ TP_printk("dev %d:%d tag %s flags %s file %s line_num %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_SHUTDOWN_STRINGS),
+ __get_str(fname),
+ __entry->line_num)
+);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 3b208f9a865c..7bd16fbff534 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -16,18 +16,22 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_icache.h"
-kmem_zone_t *xfs_trans_zone;
+struct kmem_cache *xfs_trans_cache;
#if defined(CONFIG_TRACEPOINTS)
static void
xfs_trans_trace_reservations(
struct xfs_mount *mp)
{
- struct xfs_trans_res resv;
struct xfs_trans_res *res;
struct xfs_trans_res *end_res;
int i;
@@ -36,8 +40,6 @@ xfs_trans_trace_reservations(
end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (i = 0; res < end_res; i++, res++)
trace_xfs_trans_resv_calc(mp, i, res);
- xfs_log_get_max_trans_res(mp, &resv);
- trace_xfs_trans_resv_calc(mp, -1, &resv);
}
#else
# define xfs_trans_trace_reservations(mp)
@@ -67,11 +69,11 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
- atomic_dec(&tp->t_mountp->m_active_trans);
+ xfs_trans_clear_context(tp);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
- kmem_cache_free(xfs_trans_zone, tp);
+ kmem_cache_free(xfs_trans_cache, tp);
}
/*
@@ -90,7 +92,7 @@ xfs_trans_dup(
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
+ ntp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
/*
* Initialize the new transaction structure.
@@ -107,7 +109,8 @@ xfs_trans_dup(
ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
(tp->t_flags & XFS_TRANS_RESERVE) |
- (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
+ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) |
+ (tp->t_flags & XFS_TRANS_RES_FDBLKS);
/* We gave our writer reference to the new transaction */
tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
@@ -118,14 +121,13 @@ xfs_trans_dup(
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
- ntp->t_pflags = tp->t_pflags;
+
+ xfs_trans_switch_context(tp, ntp);
/* move deferred ops over to the new tp */
xfs_defer_move(ntp, tp);
xfs_trans_dup_dqinfo(tp, ntp);
-
- atomic_inc(&tp->t_mountp->m_active_trans);
return ntp;
}
@@ -150,11 +152,9 @@ xfs_trans_reserve(
uint blocks,
uint rtextents)
{
- int error = 0;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
-
- /* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/*
* Attempt to reserve the needed disk blocks by decrementing
@@ -162,11 +162,9 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
- if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
+ if (error != 0)
return -ENOSPC;
- }
tp->t_blk_res += blocks;
}
@@ -191,13 +189,11 @@ xfs_trans_reserve(
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(tp->t_mountp,
- resp->tr_logres,
+ error = xfs_log_reserve(mp, resp->tr_logres,
resp->tr_logcount,
- &tp->t_ticket, XFS_TRANSACTION,
- permanent);
+ &tp->t_ticket, permanent);
}
if (error)
@@ -213,7 +209,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+ error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -229,7 +225,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -237,12 +233,9 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
-
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
return error;
}
@@ -256,6 +249,7 @@ xfs_trans_alloc(
struct xfs_trans **tpp)
{
struct xfs_trans *tp;
+ bool want_retry = true;
int error;
/*
@@ -263,9 +257,11 @@ xfs_trans_alloc(
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
- tp = kmem_zone_zalloc(xfs_trans_zone, 0);
+retry:
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
/*
* Zero-reservation ("empty") transactions can't modify anything, so
@@ -273,7 +269,8 @@ xfs_trans_alloc(
*/
WARN_ON(resp->tr_logres > 0 &&
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- atomic_inc(&mp->m_active_trans);
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
+ xfs_has_lazysbcount(mp));
tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_flags = flags;
@@ -284,6 +281,19 @@ xfs_trans_alloc(
tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error == -ENOSPC && want_retry) {
+ xfs_trans_cancel(tp);
+
+ /*
+ * We weren't able to reserve enough space for the transaction.
+ * Flush the other speculative space allocations to free space.
+ * Do not perform a synchronous scan because callers can hold
+ * other locks.
+ */
+ xfs_blockgc_flush_all(mp);
+ want_retry = false;
+ goto retry;
+ }
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -297,15 +307,19 @@ xfs_trans_alloc(
/*
* Create an empty transaction with no reservation. This is a defensive
- * mechanism for routines that query metadata without actually modifying
- * them -- if the metadata being queried is somehow cross-linked (think a
- * btree block pointer that points higher in the tree), we risk deadlock.
- * However, blocks grabbed as part of a transaction can be re-grabbed.
- * The verifiers will notice the corrupt block and the operation will fail
- * back to userspace without deadlocking.
+ * mechanism for routines that query metadata without actually modifying them --
+ * if the metadata being queried is somehow cross-linked (think a btree block
+ * pointer that points higher in the tree), we risk deadlock. However, blocks
+ * grabbed as part of a transaction can be re-grabbed. The verifiers will
+ * notice the corrupt block and the operation will fail back to userspace
+ * without deadlocking.
*
- * Note the zero-length reservation; this transaction MUST be cancelled
- * without any dirty data.
+ * Note the zero-length reservation; this transaction MUST be cancelled without
+ * any dirty data.
+ *
+ * Callers should obtain freeze protection to avoid a conflict with fs freezing
+ * where we can be grabbing buffers at the same time that freeze is trying to
+ * drain the buffer LRU list.
*/
int
xfs_trans_alloc_empty(
@@ -345,12 +359,12 @@ xfs_trans_mod_sb(
switch (field) {
case XFS_TRANS_SB_ICOUNT:
tp->t_icount_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_IFREE:
tp->t_ifree_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_FDBLOCKS:
@@ -363,9 +377,23 @@ xfs_trans_mod_sb(
tp->t_blk_res_used += (uint)-delta;
if (tp->t_blk_res_used > tp->t_blk_res)
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) {
+ int64_t blkres_delta;
+
+ /*
+ * Return freed blocks directly to the reservation
+ * instead of the global pool, being careful not to
+ * overflow the trans counter. This is used to preserve
+ * reservation across chains of transaction rolls that
+ * repeatedly free and allocate blocks.
+ */
+ blkres_delta = min_t(int64_t, delta,
+ UINT_MAX - tp->t_blk_res);
+ tp->t_blk_res += blkres_delta;
+ delta -= blkres_delta;
}
tp->t_fdblocks_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_RES_FDBLOCKS:
@@ -375,7 +403,7 @@ xfs_trans_mod_sb(
* be applied to the on-disk superblock.
*/
tp->t_res_fdblocks_delta += delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ if (xfs_has_lazysbcount(mp))
flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_FREXTENTS:
@@ -400,7 +428,6 @@ xfs_trans_mod_sb(
tp->t_res_frextents_delta += delta;
break;
case XFS_TRANS_SB_DBLOCKS:
- ASSERT(delta > 0);
tp->t_dblocks_delta += delta;
break;
case XFS_TRANS_SB_AGCOUNT:
@@ -445,24 +472,17 @@ STATIC void
xfs_trans_apply_sb_deltas(
xfs_trans_t *tp)
{
- xfs_dsb_t *sbp;
- xfs_buf_t *bp;
+ struct xfs_dsb *sbp;
+ struct xfs_buf *bp;
int whole = 0;
- bp = xfs_trans_getsb(tp, tp->t_mountp);
- sbp = XFS_BUF_TO_SBP(bp);
-
- /*
- * Check that superblock mods match the mods made to AGF counters.
- */
- ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
- (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
- tp->t_ag_btree_delta));
+ bp = xfs_trans_getsb(tp);
+ sbp = bp->b_addr;
/*
* Only update the superblock counters if we are logging them
*/
- if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
+ if (!xfs_has_lazysbcount((tp->t_mountp))) {
if (tp->t_icount_delta)
be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
if (tp->t_ifree_delta)
@@ -473,10 +493,31 @@ xfs_trans_apply_sb_deltas(
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
}
- if (tp->t_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
- if (tp->t_res_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+ /*
+ * Updating frextents requires careful handling because it does not
+ * behave like the lazysb counters because we cannot rely on log
+ * recovery in older kenels to recompute the value from the rtbitmap.
+ * This means that the ondisk frextents must be consistent with the
+ * rtbitmap.
+ *
+ * Therefore, log the frextents change to the ondisk superblock and
+ * update the incore superblock so that future calls to xfs_log_sb
+ * write the correct value ondisk.
+ *
+ * Don't touch m_frextents because it includes incore reservations,
+ * and those are handled by the unreserve function.
+ */
+ if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ struct xfs_mount *mp = tp->t_mountp;
+ int64_t rtxdelta;
+
+ rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+ spin_lock(&mp->m_sb_lock);
+ be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+ mp->m_sb.sb_frextents += rtxdelta;
+ spin_unlock(&mp->m_sb_lock);
+ }
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -516,68 +557,20 @@ xfs_trans_apply_sb_deltas(
/*
* Log the whole thing, the fields are noncontiguous.
*/
- xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
else
/*
* Since all the modifiable fields are contiguous, we
* can get away with this.
*/
- xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
- offsetof(xfs_dsb_t, sb_frextents) +
+ xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount),
+ offsetof(struct xfs_dsb, sb_frextents) +
sizeof(sbp->sb_frextents) - 1);
}
-STATIC int
-xfs_sb_mod8(
- uint8_t *field,
- int8_t delta)
-{
- int8_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
-STATIC int
-xfs_sb_mod32(
- uint32_t *field,
- int32_t delta)
-{
- int32_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
-STATIC int
-xfs_sb_mod64(
- uint64_t *field,
- int64_t delta)
-{
- int64_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
/*
- * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
- * and apply superblock counter changes to the in-core superblock. The
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and
+ * apply superblock counter changes to the in-core superblock. The
* t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
* applied to the in-core superblock. The idea is that that has already been
* done.
@@ -586,7 +579,12 @@ xfs_sb_mod64(
* used block counts are not updated in the on disk superblock. In this case,
* XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
* still need to update the incore superblock with the changes.
+ *
+ * Deltas for the inode count are +/-64, hence we use a large batch size of 128
+ * so we don't need to take the counter lock on every update.
*/
+#define XFS_ICOUNT_BATCH 128
+
void
xfs_trans_unreserve_and_mod_sb(
struct xfs_trans *tp)
@@ -603,7 +601,7 @@ xfs_trans_unreserve_and_mod_sb(
if (tp->t_blk_res > 0)
blkdelta = tp->t_blk_res;
if ((tp->t_fdblocks_delta != 0) &&
- (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+ (xfs_has_lazysbcount(mp) ||
(tp->t_flags & XFS_TRANS_SB_DIRTY)))
blkdelta += tp->t_fdblocks_delta;
@@ -613,7 +611,7 @@ xfs_trans_unreserve_and_mod_sb(
(tp->t_flags & XFS_TRANS_SB_DIRTY))
rtxdelta += tp->t_frextents_delta;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+ if (xfs_has_lazysbcount(mp) ||
(tp->t_flags & XFS_TRANS_SB_DIRTY)) {
idelta = tp->t_icount_delta;
ifreedelta = tp->t_ifree_delta;
@@ -622,116 +620,51 @@ xfs_trans_unreserve_and_mod_sb(
/* apply the per-cpu counters */
if (blkdelta) {
error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
- if (error)
- goto out;
+ ASSERT(!error);
}
- if (idelta) {
- error = xfs_mod_icount(mp, idelta);
- if (error)
- goto out_undo_fdblocks;
- }
+ if (idelta)
+ percpu_counter_add_batch(&mp->m_icount, idelta,
+ XFS_ICOUNT_BATCH);
- if (ifreedelta) {
- error = xfs_mod_ifree(mp, ifreedelta);
- if (error)
- goto out_undo_icount;
+ if (ifreedelta)
+ percpu_counter_add(&mp->m_ifree, ifreedelta);
+
+ if (rtxdelta) {
+ error = xfs_mod_frextents(mp, rtxdelta);
+ ASSERT(!error);
}
- if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
/* apply remaining deltas */
spin_lock(&mp->m_sb_lock);
- if (rtxdelta) {
- error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
- if (error)
- goto out_undo_ifree;
- }
-
- if (tp->t_dblocks_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
- if (error)
- goto out_undo_frextents;
- }
- if (tp->t_agcount_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
- if (error)
- goto out_undo_dblocks;
- }
- if (tp->t_imaxpct_delta != 0) {
- error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
- if (error)
- goto out_undo_agcount;
- }
- if (tp->t_rextsize_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
- tp->t_rextsize_delta);
- if (error)
- goto out_undo_imaxpct;
- }
- if (tp->t_rbmblocks_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
- tp->t_rbmblocks_delta);
- if (error)
- goto out_undo_rextsize;
- }
- if (tp->t_rblocks_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
- if (error)
- goto out_undo_rbmblocks;
- }
- if (tp->t_rextents_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
- tp->t_rextents_delta);
- if (error)
- goto out_undo_rblocks;
- }
- if (tp->t_rextslog_delta != 0) {
- error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
- tp->t_rextslog_delta);
- if (error)
- goto out_undo_rextents;
- }
+ mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
+ mp->m_sb.sb_icount += idelta;
+ mp->m_sb.sb_ifree += ifreedelta;
+ /*
+ * Do not touch sb_frextents here because we are dealing with incore
+ * reservation. sb_frextents is not part of the lazy sb counters so it
+ * must be consistent with the ondisk rtbitmap and must never include
+ * incore reservations.
+ */
+ mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
+ mp->m_sb.sb_agcount += tp->t_agcount_delta;
+ mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
+ mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
+ mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
+ mp->m_sb.sb_rextents += tp->t_rextents_delta;
+ mp->m_sb.sb_rextslog += tp->t_rextslog_delta;
spin_unlock(&mp->m_sb_lock);
- return;
-out_undo_rextents:
- if (tp->t_rextents_delta)
- xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
-out_undo_rblocks:
- if (tp->t_rblocks_delta)
- xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
-out_undo_rbmblocks:
- if (tp->t_rbmblocks_delta)
- xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
-out_undo_rextsize:
- if (tp->t_rextsize_delta)
- xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
-out_undo_imaxpct:
- if (tp->t_rextsize_delta)
- xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
-out_undo_agcount:
- if (tp->t_agcount_delta)
- xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
-out_undo_dblocks:
- if (tp->t_dblocks_delta)
- xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
-out_undo_frextents:
- if (rtxdelta)
- xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
-out_undo_ifree:
- spin_unlock(&mp->m_sb_lock);
- if (ifreedelta)
- xfs_mod_ifree(mp, -ifreedelta);
-out_undo_icount:
- if (idelta)
- xfs_mod_icount(mp, -idelta);
-out_undo_fdblocks:
- if (blkdelta)
- xfs_mod_fdblocks(mp, -blkdelta, rsvd);
-out:
- ASSERT(error == 0);
+ /*
+ * Debug checks outside of the spinlock so they don't lock up the
+ * machine if they fail.
+ */
+ ASSERT(mp->m_sb.sb_imax_pct >= 0);
+ ASSERT(mp->m_sb.sb_rextslog >= 0);
return;
}
@@ -741,7 +674,7 @@ xfs_trans_add_item(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
- ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_log == tp->t_mountp->m_log);
ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
ASSERT(list_empty(&lip->li_trans));
ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
@@ -827,7 +760,7 @@ xfs_log_item_batch_insert(
void
xfs_trans_committed_bulk(
struct xfs_ail *ailp,
- struct xfs_log_vec *log_vector,
+ struct list_head *lv_chain,
xfs_lsn_t commit_lsn,
bool aborted)
{
@@ -842,7 +775,7 @@ xfs_trans_committed_bulk(
spin_unlock(&ailp->ail_lock);
/* unpin all the log items */
- for (lv = log_vector; lv; lv = lv->lv_next ) {
+ list_for_each_entry(lv, lv_chain, lv_list) {
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
@@ -868,7 +801,7 @@ xfs_trans_committed_bulk(
* object into the AIL as we are in a shutdown situation.
*/
if (aborted) {
- ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ ASSERT(xlog_is_shutdown(ailp->ail_log));
if (lip->li_ops->iop_unpin)
lip->li_ops->iop_unpin(lip, 1);
continue;
@@ -912,6 +845,90 @@ xfs_trans_committed_bulk(
}
/*
+ * Sort transaction items prior to running precommit operations. This will
+ * attempt to order the items such that they will always be locked in the same
+ * order. Items that have no sort function are moved to the end of the list
+ * and so are locked last.
+ *
+ * This may need refinement as different types of objects add sort functions.
+ *
+ * Function is more complex than it needs to be because we are comparing 64 bit
+ * values and the function only returns 32 bit values.
+ */
+static int
+xfs_trans_precommit_sort(
+ void *unused_arg,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_item *lia = container_of(a,
+ struct xfs_log_item, li_trans);
+ struct xfs_log_item *lib = container_of(b,
+ struct xfs_log_item, li_trans);
+ int64_t diff;
+
+ /*
+ * If both items are non-sortable, leave them alone. If only one is
+ * sortable, move the non-sortable item towards the end of the list.
+ */
+ if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
+ return 0;
+ if (!lia->li_ops->iop_sort)
+ return 1;
+ if (!lib->li_ops->iop_sort)
+ return -1;
+
+ diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Run transaction precommit functions.
+ *
+ * If there is an error in any of the callouts, then stop immediately and
+ * trigger a shutdown to abort the transaction. There is no recovery possible
+ * from errors at this point as the transaction is dirty....
+ */
+static int
+xfs_trans_run_precommits(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip, *n;
+ int error = 0;
+
+ /*
+ * Sort the item list to avoid ABBA deadlocks with other transactions
+ * running precommit operations that lock multiple shared items such as
+ * inode cluster buffers.
+ */
+ list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
+
+ /*
+ * Precommit operations can remove the log item from the transaction
+ * if the log item exists purely to delay modifications until they
+ * can be ordered against other operations. Hence we have to use
+ * list_for_each_entry_safe() here.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+ if (lip->li_ops->iop_precommit) {
+ error = lip->li_ops->iop_precommit(tp, lip);
+ if (error)
+ break;
+ }
+ }
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+}
+
+/*
* Commit the given transaction to the log.
*
* XFS disk error handling mechanism is not based on a typical
@@ -929,12 +946,20 @@ __xfs_trans_commit(
bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
- xfs_lsn_t commit_lsn = -1;
+ struct xlog *log = mp->m_log;
+ xfs_csn_t commit_seq = 0;
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
trace_xfs_trans_commit(tp, _RET_IP_);
+ error = xfs_trans_run_precommits(tp);
+ if (error) {
+ if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+ xfs_defer_cancel(tp);
+ goto out_unreserve;
+ }
+
/*
* Finish deferred items on final commit. Only permanent transactions
* should ever have deferred ops.
@@ -957,7 +982,13 @@ __xfs_trans_commit(
if (!(tp->t_flags & XFS_TRANS_DIRTY))
goto out_unreserve;
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ /*
+ * We must check against log shutdown here because we cannot abort log
+ * items and leave them dirty, inconsistent and unpinned in memory while
+ * the log is active. This leaves them open to being written back to
+ * disk, and that will lead to on-disk corruption.
+ */
+ if (xlog_is_shutdown(log)) {
error = -EIO;
goto out_unreserve;
}
@@ -971,9 +1002,8 @@ __xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
+ xlog_cil_commit(log, tp, &commit_seq, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -981,7 +1011,7 @@ __xfs_trans_commit(
* log out now and wait for it.
*/
if (sync) {
- error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
+ error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL);
XFS_STATS_INC(mp, xs_trans_sync);
} else {
XFS_STATS_INC(mp, xs_trans_async);
@@ -999,12 +1029,12 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
- if (commit_lsn == -1 && !error)
- error = -EIO;
+ if (regrant && !xlog_is_shutdown(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
}
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
@@ -1020,53 +1050,70 @@ xfs_trans_commit(
}
/*
- * Unlock all of the transaction's items and free the transaction.
- * The transaction must not have modified any of its items, because
- * there is no way to restore them to their previous state.
+ * Unlock all of the transaction's items and free the transaction. If the
+ * transaction is dirty, we must shut down the filesystem because there is no
+ * way to restore them to their previous state.
*
- * If the transaction has made a log reservation, make sure to release
- * it as well.
+ * If the transaction has made a log reservation, make sure to release it as
+ * well.
+ *
+ * This is a high level function (equivalent to xfs_trans_commit()) and so can
+ * be called after the transaction has effectively been aborted due to the mount
+ * being shut down. However, if the mount has not been shut down and the
+ * transaction is dirty we will shut the mount down and, in doing so, that
+ * guarantees that the log is shut down, too. Hence we don't need to be as
+ * careful with shutdown state and dirty items here as we need to be in
+ * xfs_trans_commit().
*/
void
xfs_trans_cancel(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xlog *log = mp->m_log;
bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
trace_xfs_trans_cancel(tp, _RET_IP_);
- if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+ /*
+ * It's never valid to cancel a transaction with deferred ops attached,
+ * because the transaction is effectively dirty. Complain about this
+ * loudly before freeing the in-memory defer items.
+ */
+ if (!list_empty(&tp->t_dfops)) {
+ ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops));
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ dirty = true;
xfs_defer_cancel(tp);
+ }
/*
- * See if the caller is relying on us to shut down the
- * filesystem. This happens in paths where we detect
- * corruption and decide to give up.
+ * See if the caller is relying on us to shut down the filesystem. We
+ * only want an error report if there isn't already a shutdown in
+ * progress, so we only need to check against the mount shutdown state
+ * here.
*/
- if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (dirty && !xfs_is_shutdown(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
+ /* Log items need to be consistent until the log is shut down. */
+ if (!dirty && !xlog_is_shutdown(log)) {
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
- ASSERT(!(lip->li_type == XFS_LI_EFD));
+ ASSERT(!xlog_item_is_intent_done(lip));
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_done(mp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
}
- /* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
@@ -1118,3 +1165,269 @@ xfs_trans_roll(
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
return xfs_trans_reserve(*tpp, &tres, 0, 0);
}
+
+/*
+ * Allocate an transaction, lock and join the inode to it, and reserve quota.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The caller is responsible for
+ * releasing ILOCK_EXCL if a new transaction is returned.
+ */
+int
+xfs_trans_alloc_inode(
+ struct xfs_inode *ip,
+ struct xfs_trans_res *resv,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks,
+ rblocks / mp->m_sb.sb_rextsize,
+ force ? XFS_TRANS_RESERVE : 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_blockgc_free_quota(ip, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Allocate an transaction in preparation for inode creation by reserving quota
+ * against the given dquots. Callers are not required to hold any inode locks.
+ */
+int
+xfs_trans_alloc_icreate(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int dblocks,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ *tpp = tp;
+ return 0;
+}
+
+/*
+ * Allocate an transaction, lock and join the inode to it, and reserve quota
+ * in preparation for inode attribute changes that include uid, gid, or prid
+ * changes.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCK will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_ichange(
+ struct xfs_inode *ip,
+ struct xfs_dquot *new_udqp,
+ struct xfs_dquot *new_gdqp,
+ struct xfs_dquot *new_pdqp,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ /*
+ * For each quota type, skip quota reservations if the inode's dquots
+ * now match the ones that came from the caller, or the caller didn't
+ * pass one in. The inode's dquots can change if we drop the ILOCK to
+ * perform a blockgc scan, so we must preserve the caller's arguments.
+ */
+ udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL;
+ gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL;
+ pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL;
+ if (udqp || gdqp || pdqp) {
+ unsigned int qflags = XFS_QMOPT_RES_REGBLKS;
+
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /*
+ * Reserve enough quota to handle blocks on disk and reserved
+ * for a delayed allocation. We'll actually transfer the
+ * delalloc reservation between dquots at chown time, even
+ * though that part is only semi-transactional.
+ */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
+ pdqp, ip->i_nblocks + ip->i_delayed_blks,
+ 1, qflags);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+ }
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/*
+ * Allocate an transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update. If there isn't sufficient space,
+ * @dblocks will be set to zero for a reservationless directory update and
+ * @nospace_error will be set to a negative errno describing the space
+ * constraint we hit.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCKs will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_dir(
+ struct xfs_inode *dp,
+ struct xfs_trans_res *resv,
+ struct xfs_inode *ip,
+ unsigned int *dblocks,
+ struct xfs_trans **tpp,
+ int *nospace_error)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int resblks;
+ bool retried = false;
+ int error;
+
+retry:
+ *nospace_error = 0;
+ resblks = *dblocks;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ if (error == -ENOSPC) {
+ *nospace_error = error;
+ resblks = 0;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ }
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(dp, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ if (resblks == 0)
+ goto done;
+
+ error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ *nospace_error = error;
+ resblks = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_cancel;
+
+done:
+ *tpp = tp;
+ *dblocks = resblks;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 752c7fef9de7..55819785941c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -8,6 +8,7 @@
/* kernel only transaction subsystem defines */
+struct xlog;
struct xfs_buf;
struct xfs_buftarg;
struct xfs_efd_log_item;
@@ -31,23 +32,20 @@ struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_mount *li_mountp; /* ptr to fs mount */
+ struct xlog *li_log;
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
unsigned long li_flags; /* misc flags */
struct xfs_buf *li_buf; /* real buffer pointer */
struct list_head li_bio_list; /* buffer item list */
- void (*li_cb)(struct xfs_buf *,
- struct xfs_log_item *);
- /* buffer item iodone */
- /* callback func */
const struct xfs_item_ops *li_ops; /* function list */
/* delayed logging */
struct list_head li_cil; /* CIL pointers */
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
- xfs_lsn_t li_seq; /* CIL commit seq */
+ xfs_csn_t li_seq; /* CIL commit seq */
+ uint32_t li_order_id; /* CIL commit order */
};
/*
@@ -58,13 +56,15 @@ struct xfs_log_item {
#define XFS_LI_IN_AIL 0
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
-#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
+#define XFS_LI_DIRTY 3
+#define XFS_LI_WHITEOUT 4
#define XFS_LI_FLAGS \
- { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
- { (1 << XFS_LI_ABORTED), "ABORTED" }, \
- { (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }
+ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1u << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1u << XFS_LI_FAILED), "FAILED" }, \
+ { (1u << XFS_LI_DIRTY), "DIRTY" }, \
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
struct xfs_item_ops {
unsigned flags;
@@ -72,18 +72,42 @@ struct xfs_item_ops {
void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
void (*iop_pin)(struct xfs_log_item *);
void (*iop_unpin)(struct xfs_log_item *, int remove);
+ uint64_t (*iop_sort)(struct xfs_log_item *lip);
+ int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip);
+ void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
+ xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
- void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
void (*iop_release)(struct xfs_log_item *);
- xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
- void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
+ int (*iop_recover)(struct xfs_log_item *lip,
+ struct list_head *capture_list);
+ bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
+ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
+ struct xfs_trans *tp);
+ struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
/*
- * Release the log item as soon as committed. This is for items just logging
- * intents that never need to be written back in place.
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
*/
#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+#define XFS_ITEM_INTENT (1 << 1)
+#define XFS_ITEM_INTENT_DONE (1 << 2)
+
+static inline bool
+xlog_item_is_intent(struct xfs_log_item *lip)
+{
+ return lip->li_ops->flags & XFS_ITEM_INTENT;
+}
+
+static inline bool
+xlog_item_is_intent_done(struct xfs_log_item *lip)
+{
+ return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
+}
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
@@ -97,12 +121,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
#define XFS_ITEM_FLUSHING 3
/*
- * Deferred operation item relogging limits.
- */
-#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
-#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
-
-/*
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
@@ -124,11 +142,6 @@ typedef struct xfs_trans {
int64_t t_res_fdblocks_delta; /* on-disk only chg */
int64_t t_frextents_delta;/* superblock freextents chg*/
int64_t t_res_frextents_delta; /* on-disk only chg */
-#if defined(DEBUG) || defined(XFS_WARN)
- int64_t t_ag_freeblks_delta; /* debugging counter */
- int64_t t_ag_flist_delta; /* debugging counter */
- int64_t t_ag_btree_delta; /* debugging counter */
-#endif
int64_t t_dblocks_delta;/* superblock dblocks change */
int64_t t_agcount_delta;/* superblock agcount change */
int64_t t_imaxpct_delta;/* superblock imaxpct change */
@@ -149,16 +162,6 @@ typedef struct xfs_trans {
*/
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
-#if defined(DEBUG) || defined(XFS_WARN)
-#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d)
-#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d)
-#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d)
-#else
-#define xfs_trans_agblocks_delta(tp, d)
-#define xfs_trans_agflist_delta(tp, d)
-#define xfs_trans_agbtree_delta(tp, d)
-#endif
-
/*
* XFS transaction mechanism exported interfaces.
*/
@@ -179,7 +182,7 @@ xfs_trans_get_buf(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
@@ -210,7 +213,7 @@ xfs_trans_read_buf(
flags, bpp, ops);
}
-struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
+struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
@@ -242,6 +245,60 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
struct xfs_buf *src_bp);
-extern kmem_zone_t *xfs_trans_zone;
+extern struct kmem_cache *xfs_trans_cache;
+
+static inline struct xfs_log_item *
+xfs_trans_item_relog(
+ struct xfs_log_item *lip,
+ struct xfs_trans *tp)
+{
+ return lip->li_ops->iop_relog(lip, tp);
+}
+
+struct xfs_dquot;
+
+int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
+ unsigned int dblocks, unsigned int rblocks, bool force,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, unsigned int dblocks,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+ struct xfs_inode *ip, unsigned int *dblocks,
+ struct xfs_trans **tpp, int *nospace_error);
+
+static inline void
+xfs_trans_set_context(
+ struct xfs_trans *tp)
+{
+ ASSERT(current->journal_info == NULL);
+ tp->t_pflags = memalloc_nofs_save();
+ current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+ struct xfs_trans *tp)
+{
+ if (current->journal_info == tp) {
+ memalloc_nofs_restore(tp->t_pflags);
+ current->journal_info = NULL;
+ }
+}
+
+static inline void
+xfs_trans_switch_context(
+ struct xfs_trans *old_tp,
+ struct xfs_trans *new_tp)
+{
+ ASSERT(current->journal_info == old_tp);
+ new_tp->t_pflags = old_tp->t_pflags;
+ old_tp->t_pflags = 0;
+ current->journal_info = new_tp;
+}
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 00cc5b8734be..f51df7d94ef7 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -17,6 +17,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#ifdef DEBUG
/*
@@ -32,6 +33,7 @@ STATIC void
xfs_ail_check(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
+ __must_hold(&ailp->ail_lock)
{
struct xfs_log_item *prev_lip;
struct xfs_log_item *next_lip;
@@ -108,17 +110,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
- lip = xfs_ail_min(ailp);
- if (lip)
- lsn = lip->li_lsn;
+ lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
@@ -336,6 +346,49 @@ xfs_ail_delete(
xfs_trans_ail_cursor_clear(ailp, lip);
}
+/*
+ * Requeue a failed buffer for writeback.
+ *
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ */
+static inline int
+xfsaild_resubmit_item(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ struct xfs_buf *bp = lip->li_buf;
+
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+ xfs_buf_unlock(bp);
+ return XFS_ITEM_FLUSHING;
+ }
+
+ /* protected by ail_lock */
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+ if (bp->b_flags & _XBF_INODES)
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
+ else
+ xfs_clear_li_failed(lip);
+ }
+
+ xfs_buf_unlock(bp);
+ return XFS_ITEM_SUCCESS;
+}
+
static inline uint
xfsaild_push_item(
struct xfs_ail *ailp,
@@ -345,7 +398,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
@@ -356,6 +409,8 @@ xfsaild_push_item(
*/
if (!lip->li_ops->iop_push)
return XFS_ITEM_PINNED;
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags))
+ return xfsaild_resubmit_item(lip, &ailp->ail_buf_list);
return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
}
@@ -363,7 +418,7 @@ static long
xfsaild_push(
struct xfs_ail *ailp)
{
- xfs_mount_t *mp = ailp->ail_mount;
+ struct xfs_mount *mp = ailp->ail_log->l_mp;
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
@@ -375,8 +430,12 @@ xfsaild_push(
/*
* If we encountered pinned items or did not finish writing out all
- * buffers the last time we ran, force the log first and wait for it
- * before pushing again.
+ * buffers the last time we ran, force a background CIL push to get the
+ * items unpinned in the near future. We do not wait on the CIL push as
+ * that could stall us for seconds if there is enough background IO
+ * load. Stalling for that long when the tail of the log is pinned and
+ * needs flushing will hard stop the transaction subsystem when log
+ * space runs out.
*/
if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 &&
(!list_empty_careful(&ailp->ail_buf_list) ||
@@ -384,26 +443,32 @@ xfsaild_push(
ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush);
- xfs_log_force(mp, XFS_LOG_SYNC);
+ xlog_cil_flush(ailp->ail_log);
}
spin_lock(&ailp->ail_lock);
- /* barrier matches the ail_target update in xfs_ail_push() */
- smp_rmb();
- target = ailp->ail_target;
- ailp->ail_target_prev = target;
+ /*
+ * If we have a sync push waiter, we always have to push till the AIL is
+ * empty. Update the target to point to the end of the AIL so that
+ * capture updates that occur after the sync push waiter has gone to
+ * sleep.
+ */
+ if (waitqueue_active(&ailp->ail_empty)) {
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ target = lip->li_lsn;
+ } else {
+ /* barrier matches the ail_target update in xfs_ail_push() */
+ smp_rmb();
+ target = ailp->ail_target;
+ ailp->ail_target_prev = target;
+ }
+ /* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
- if (!lip) {
- /*
- * If the AIL is empty or our push has reached the end we are
- * done now.
- */
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ if (!lip)
goto out_done;
- }
XFS_STATS_INC(mp, xs_push_ail);
@@ -432,7 +497,7 @@ xfsaild_push(
* inode buffer is locked because we already pushed the
* updates to it as part of inode clustering.
*
- * We do not want to to stop flushing just because lots
+ * We do not want to stop flushing just because lots
* of items are already being flushed, but we need to
* re-try the flushing relatively soon if most of the
* AIL is being flushed.
@@ -467,7 +532,7 @@ xfsaild_push(
/*
* Are there too many items we can't do anything with?
*
- * If we we are skipping too many items because we can't flush
+ * If we are skipping too many items because we can't flush
* them or they are already being flushed, we back off and
* given them time to complete whatever operation is being
* done. i.e. remove pressure from the AIL while we can't make
@@ -485,6 +550,8 @@ xfsaild_push(
break;
lsn = lip->li_lsn;
}
+
+out_done:
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
@@ -492,7 +559,6 @@ xfsaild_push(
ailp->ail_log_flush++;
if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
-out_done:
/*
* We reached the target or the AIL is empty, so wait a bit
* longer for I/O to complete and remove pushed items from the
@@ -529,15 +595,16 @@ xfsaild(
{
struct xfs_ail *ailp = data;
long tout = 0; /* milliseconds */
+ unsigned int noreclaim_flag;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
set_freezable();
while (1) {
if (tout && tout <= 20)
- set_current_state(TASK_KILLABLE);
+ set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
else
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
/*
* Check kthread_should_stop() after we set the task state to
@@ -565,7 +632,7 @@ xfsaild(
* opportunity to release such buffers from the queue.
*/
ASSERT(list_empty(&ailp->ail_buf_list) ||
- XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ xlog_is_shutdown(ailp->ail_log));
xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
@@ -583,16 +650,17 @@ xfsaild(
*/
smp_rmb();
if (!xfs_ail_min(ailp) &&
- ailp->ail_target == ailp->ail_target_prev) {
+ ailp->ail_target == ailp->ail_target_prev &&
+ list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
- freezable_schedule();
+ schedule();
tout = 0;
continue;
}
spin_unlock(&ailp->ail_lock);
if (tout)
- freezable_schedule_timeout(msecs_to_jiffies(tout));
+ schedule_timeout(msecs_to_jiffies(tout));
__set_current_state(TASK_RUNNING);
@@ -601,6 +669,7 @@ xfsaild(
tout = xfsaild_push(ailp);
}
+ memalloc_noreclaim_restore(noreclaim_flag);
return 0;
}
@@ -626,7 +695,7 @@ xfs_ail_push(
struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
- if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) ||
+ if (!lip || xlog_is_shutdown(ailp->ail_log) ||
XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
return;
@@ -661,13 +730,11 @@ void
xfs_ail_push_all_sync(
struct xfs_ail *ailp)
{
- struct xfs_log_item *lip;
DEFINE_WAIT(wait);
spin_lock(&ailp->ail_lock);
- while ((lip = xfs_ail_max(ailp)) != NULL) {
+ while (xfs_ail_max(ailp) != NULL) {
prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
- ailp->ail_target = lip->li_lsn;
wake_up_process(ailp->ail_task);
spin_unlock(&ailp->ail_lock);
schedule();
@@ -678,6 +745,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xlog *log = ailp->ail_log;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!xlog_is_shutdown(log))
+ xlog_assign_tail_lsn_locked(log->l_mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(log->l_mp);
+}
+
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
@@ -709,7 +798,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
- int mlip_changed = 0;
+ xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
@@ -724,9 +813,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
xfs_ail_delete(ailp, lip);
- if (mlip == lip)
- mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
@@ -737,85 +827,70 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- spin_unlock(&ailp->ail_lock);
+ xfs_ail_update_finish(ailp, tail_lsn);
+}
- xfs_log_space_wake(ailp->ail_mount);
- } else {
- spin_unlock(&ailp->ail_lock);
- }
+/* Insert a log item into the AIL. */
+void
+xfs_trans_ail_insert(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
- xfs_clear_li_failed(lip);
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
- return mlip == lip;
+ if (mlip == lip)
+ return lsn;
+ return 0;
}
-/**
- * Remove a log items from the AIL
- *
- * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
- * removed from the AIL. The caller is already holding the AIL lock, and done
- * all the checks necessary to ensure the items passed in via @log_items are
- * ready for deletion. This includes checking that the items are in the AIL.
- *
- * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
- * flag from the item and reset the item's lsn to 0. If we remove the first
- * item in the AIL, update the log tail to match the new minimum LSN in the
- * AIL.
- *
- * This function will not drop the AIL lock until all items are removed from
- * the AIL to minimise the amount of lock traffic on the AIL. This does not
- * greatly increase the AIL hold time, but does significantly reduce the amount
- * of traffic on the lock, especially during IO completion.
- *
- * This function must be called with the AIL lock held. The lock is dropped
- * before returning.
- */
void
xfs_trans_ail_delete(
- struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock)
+ int shutdown_type)
{
- struct xfs_mount *mp = ailp->ail_mount;
- bool mlip_changed;
+ struct xfs_ail *ailp = lip->li_ailp;
+ struct xlog *log = ailp->ail_log;
+ xfs_lsn_t tail_lsn;
+ spin_lock(&ailp->ail_lock);
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ if (shutdown_type && !xlog_is_shutdown(log)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
- xfs_force_shutdown(mp, shutdown_type);
+ xlog_force_shutdown(log, shutdown_type);
}
return;
}
- mlip_changed = xfs_ail_delete_one(ailp, lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
-
- spin_unlock(&ailp->ail_lock);
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ /* xfs_ail_update_finish() drops the AIL lock */
+ xfs_clear_li_failed(lip);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
int
@@ -828,7 +903,7 @@ xfs_trans_ail_init(
if (!ailp)
return -ENOMEM;
- ailp->ail_mount = mp;
+ ailp->ail_log = mp->m_log;
INIT_LIST_HEAD(&ailp->ail_head);
INIT_LIST_HEAD(&ailp->ail_cursors);
spin_lock_init(&ailp->ail_lock);
@@ -836,7 +911,7 @@ xfs_trans_ail_init(
init_waitqueue_head(&ailp->ail_empty);
ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->ail_mount->m_super->s_id);
+ mp->m_super->s_id);
if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 08174ffa2118..6549e50d852c 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -38,7 +38,7 @@ xfs_trans_buf_item_match(
blip = (struct xfs_buf_log_item *)lip;
if (blip->bli_item.li_type == XFS_LI_BUF &&
blip->bli_buf->b_target == target &&
- XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn &&
+ xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn &&
blip->bli_buf->b_length == len) {
ASSERT(blip->bli_buf->b_map_count == nmaps);
return blip->bli_buf;
@@ -121,7 +121,7 @@ xfs_trans_get_buf_map(
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- xfs_buf_t *bp;
+ struct xfs_buf *bp;
struct xfs_buf_log_item *bip;
int error;
@@ -138,7 +138,7 @@ xfs_trans_get_buf_map(
bp = xfs_trans_buf_item_match(tp, target, map, nmaps);
if (bp != NULL) {
ASSERT(xfs_buf_islocked(bp));
- if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+ if (xfs_is_shutdown(tp->t_mountp)) {
xfs_buf_stale(bp);
bp->b_flags |= XBF_DONE;
}
@@ -166,50 +166,34 @@ xfs_trans_get_buf_map(
}
/*
- * Get and lock the superblock buffer of this file system for the
- * given transaction.
- *
- * We don't need to use incore_match() here, because the superblock
- * buffer is a private buffer which we keep a pointer to in the
- * mount structure.
+ * Get and lock the superblock buffer for the given transaction.
*/
-xfs_buf_t *
+struct xfs_buf *
xfs_trans_getsb(
- xfs_trans_t *tp,
- struct xfs_mount *mp)
+ struct xfs_trans *tp)
{
- xfs_buf_t *bp;
- struct xfs_buf_log_item *bip;
-
- /*
- * Default to just trying to lock the superblock buffer
- * if tp is NULL.
- */
- if (tp == NULL)
- return xfs_getsb(mp);
+ struct xfs_buf *bp = tp->t_mountp->m_sb_bp;
/*
- * If the superblock buffer already has this transaction
- * pointer in its b_fsprivate2 field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the buffer to the caller.
+ * Just increment the lock recursion count if the buffer is already
+ * attached to this transaction.
*/
- bp = mp->m_sb_bp;
if (bp->b_transp == tp) {
- bip = bp->b_log_item;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
+
trace_xfs_trans_getsb_recur(bip);
- return bp;
- }
+ } else {
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ _xfs_trans_bjoin(tp, bp, 1);
- bp = xfs_getsb(mp);
- if (bp == NULL)
- return NULL;
+ trace_xfs_trans_getsb(bp->b_log_item);
+ }
- _xfs_trans_bjoin(tp, bp, 1);
- trace_xfs_trans_getsb(bp->b_log_item);
return bp;
}
@@ -260,7 +244,7 @@ xfs_trans_read_buf_map(
* We never locked this buf ourselves, so we shouldn't
* brelse it either. Just get out.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
return -EIO;
}
@@ -310,13 +294,13 @@ xfs_trans_read_buf_map(
default:
if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
- /* fall through */
+ fallthrough;
case -ENOMEM:
case -EAGAIN:
return error;
}
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (xfs_is_shutdown(mp)) {
xfs_buf_relse(bp);
trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
return -EIO;
@@ -417,7 +401,7 @@ xfs_trans_brelse(
void
xfs_trans_bhold(
xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -438,7 +422,7 @@ xfs_trans_bhold(
void
xfs_trans_bhold_release(
xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -465,24 +449,16 @@ xfs_trans_dirty_buf(
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
- ASSERT(bp->b_iodone == NULL ||
- bp->b_iodone == xfs_buf_iodone_callbacks);
/*
* Mark the buffer as needing to be written out eventually,
* and set its iodone function to remove the buffer's buf log
* item from the AIL and free it when the buffer is flushed
- * to disk. See xfs_buf_attach_iodone() for more details
- * on li_cb and xfs_buf_iodone_callbacks().
- * If we end up aborting this transaction, we trap this buffer
- * inside the b_bdstrat callback so that this won't get written to
- * disk.
+ * to disk.
*/
bp->b_flags |= XBF_DONE;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- bp->b_iodone = xfs_buf_iodone_callbacks;
- bip->bli_item.li_cb = xfs_buf_iodone;
/*
* If we invalidated the buffer within this transaction, then
@@ -562,7 +538,7 @@ xfs_trans_log_buf(
void
xfs_trans_binval(
xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
int i;
@@ -617,7 +593,7 @@ xfs_trans_binval(
void
xfs_trans_inode_buf(
xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -626,6 +602,7 @@ xfs_trans_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_BUF;
+ bp->b_flags |= _XBF_INODES;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -641,7 +618,7 @@ xfs_trans_inode_buf(
void
xfs_trans_stale_inode_buf(
xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -650,7 +627,7 @@ xfs_trans_stale_inode_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_STALE_INODE;
- bip->bli_item.li_cb = xfs_buf_iodone;
+ bp->b_flags |= _XBF_INODES;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -666,7 +643,7 @@ xfs_trans_stale_inode_buf(
void
xfs_trans_inode_alloc_buf(
xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -675,6 +652,7 @@ xfs_trans_inode_alloc_buf(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+ bp->b_flags |= _XBF_INODES;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -759,7 +737,7 @@ xfs_trans_buf_copy_type(
void
xfs_trans_dquot_buf(
xfs_trans_t *tp,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
uint type)
{
struct xfs_buf_log_item *bip = bp->b_log_item;
@@ -785,5 +763,6 @@ xfs_trans_dquot_buf(
break;
}
+ bp->b_flags |= _XBF_DQUOTS;
xfs_trans_buf_set_type(tp, bp, type);
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index d1b9869bc5fa..aa00cf67ad72 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -15,6 +15,8 @@
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -54,6 +56,12 @@ xfs_trans_log_dquot(
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ /* Upgrade the dquot to bigtime format if possible. */
+ if (dqp->q_id != 0 &&
+ xfs_has_bigtime(tp->t_mountp) &&
+ !(dqp->q_type & XFS_DQTYPE_BIGTIME))
+ dqp->q_type |= XFS_DQTYPE_BIGTIME;
+
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
@@ -77,13 +85,6 @@ xfs_trans_dup_dqinfo(
xfs_trans_alloc_dqinfo(ntp);
- /*
- * Because the quota blk reservation is carried forward,
- * it is also necessary to carry forward the DQ_DIRTY flag.
- */
- if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
- ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
-
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
oqa = otp->t_dqinfo->dqs[j];
nqa = ntp->t_dqinfo->dqs[j];
@@ -131,14 +132,10 @@ xfs_trans_mod_dquot_byino(
{
xfs_mount_t *mp = tp->t_mountp;
- if (!XFS_IS_QUOTA_RUNNING(mp) ||
- !XFS_IS_QUOTA_ON(mp) ||
+ if (!XFS_IS_QUOTA_ON(mp) ||
xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
- if (tp->t_dqinfo == NULL)
- xfs_trans_alloc_dqinfo(tp);
-
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
@@ -155,14 +152,19 @@ xfs_trans_get_dqtrx(
int i;
struct xfs_dqtrx *qa;
- if (XFS_QM_ISUDQ(dqp))
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_USER:
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
- else if (XFS_QM_ISGDQ(dqp))
+ break;
+ case XFS_DQTYPE_GROUP:
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
- else if (XFS_QM_ISPDQ(dqp))
+ break;
+ case XFS_DQTYPE_PROJ:
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
- else
+ break;
+ default:
return NULL;
+ }
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
@@ -189,9 +191,12 @@ xfs_trans_mod_dquot(
struct xfs_dqtrx *qtrx;
ASSERT(tp);
- ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+ ASSERT(XFS_IS_QUOTA_ON(tp->t_mountp));
qtrx = NULL;
+ if (!delta)
+ return;
+
if (tp->t_dqinfo == NULL)
xfs_trans_alloc_dqinfo(tp);
/*
@@ -203,37 +208,31 @@ xfs_trans_mod_dquot(
if (qtrx->qt_dquot == NULL)
qtrx->qt_dquot = dqp;
- switch (field) {
+ trace_xfs_trans_mod_dquot_before(qtrx);
+ trace_xfs_trans_mod_dquot(tp, dqp, field, delta);
- /*
- * regular disk blk reservation
- */
- case XFS_TRANS_DQ_RES_BLKS:
+ switch (field) {
+ /* regular disk blk reservation */
+ case XFS_TRANS_DQ_RES_BLKS:
qtrx->qt_blk_res += delta;
break;
- /*
- * inode reservation
- */
- case XFS_TRANS_DQ_RES_INOS:
+ /* inode reservation */
+ case XFS_TRANS_DQ_RES_INOS:
qtrx->qt_ino_res += delta;
break;
- /*
- * disk blocks used.
- */
- case XFS_TRANS_DQ_BCOUNT:
+ /* disk blocks used. */
+ case XFS_TRANS_DQ_BCOUNT:
qtrx->qt_bcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
qtrx->qt_delbcnt_delta += delta;
break;
- /*
- * Inode Count
- */
- case XFS_TRANS_DQ_ICOUNT:
+ /* Inode Count */
+ case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
@@ -241,17 +240,13 @@ xfs_trans_mod_dquot(
qtrx->qt_icount_delta += delta;
break;
- /*
- * rtblk reservation
- */
- case XFS_TRANS_DQ_RES_RTBLKS:
+ /* rtblk reservation */
+ case XFS_TRANS_DQ_RES_RTBLKS:
qtrx->qt_rtblk_res += delta;
break;
- /*
- * rtblk count
- */
- case XFS_TRANS_DQ_RTBCOUNT:
+ /* rtblk count */
+ case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
@@ -259,14 +254,15 @@ xfs_trans_mod_dquot(
qtrx->qt_rtbcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELRTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
qtrx->qt_delrtb_delta += delta;
break;
- default:
+ default:
ASSERT(0);
}
- tp->t_flags |= XFS_TRANS_DQ_DIRTY;
+
+ trace_xfs_trans_mod_dquot_after(qtrx);
}
@@ -293,6 +289,37 @@ xfs_trans_dqlockedjoin(
}
}
+/* Apply dqtrx changes to the quota reservation counters. */
+static inline void
+xfs_apply_quota_reservation_deltas(
+ struct xfs_dquot_res *res,
+ uint64_t reserved,
+ int64_t res_used,
+ int64_t count_delta)
+{
+ if (reserved != 0) {
+ /*
+ * Subtle math here: If reserved > res_used (the normal case),
+ * we're simply subtracting the unused transaction quota
+ * reservation from the dquot reservation.
+ *
+ * If, however, res_used > reserved, then we have allocated
+ * more quota blocks than were reserved for the transaction.
+ * We must add that excess to the dquot reservation since it
+ * tracks (usage + resv) and by definition we didn't reserve
+ * that excess.
+ */
+ res->reserved -= abs(reserved - res_used);
+ } else if (count_delta != 0) {
+ /*
+ * These blks were never reserved, either inside a transaction
+ * or outside one (in a delayed allocation). Also, this isn't
+ * always a negative number since we sometimes deliberately
+ * skip quota reservations.
+ */
+ res->reserved += count_delta;
+ }
+}
/*
* Called by xfs_trans_commit() and similar in spirit to
@@ -309,11 +336,10 @@ xfs_trans_apply_dquot_deltas(
int i, j;
struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
- struct xfs_disk_dquot *d;
int64_t totalbdelta;
int64_t totalrtbdelta;
- if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+ if (!tp->t_dqinfo)
return;
ASSERT(tp->t_dqinfo);
@@ -328,6 +354,8 @@ xfs_trans_apply_dquot_deltas(
xfs_trans_dqlockedjoin(tp, qa);
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ uint64_t blk_res_used;
+
qtrx = &qa[i];
/*
* The array of dquots is filled
@@ -341,7 +369,6 @@ xfs_trans_apply_dquot_deltas(
/*
* adjust the actual number of blocks used
*/
- d = &dqp->q_core;
/*
* The issue here is - sometimes we don't make a blkquota
@@ -360,38 +387,46 @@ xfs_trans_apply_dquot_deltas(
qtrx->qt_delbcnt_delta;
totalrtbdelta = qtrx->qt_rtbcount_delta +
qtrx->qt_delrtb_delta;
+
+ if (totalbdelta != 0 || totalrtbdelta != 0 ||
+ qtrx->qt_icount_delta != 0) {
+ trace_xfs_trans_apply_dquot_deltas_before(dqp);
+ trace_xfs_trans_apply_dquot_deltas(qtrx);
+ }
+
#ifdef DEBUG
if (totalbdelta < 0)
- ASSERT(be64_to_cpu(d->d_bcount) >=
- -totalbdelta);
+ ASSERT(dqp->q_blk.count >= -totalbdelta);
if (totalrtbdelta < 0)
- ASSERT(be64_to_cpu(d->d_rtbcount) >=
- -totalrtbdelta);
+ ASSERT(dqp->q_rtb.count >= -totalrtbdelta);
if (qtrx->qt_icount_delta < 0)
- ASSERT(be64_to_cpu(d->d_icount) >=
- -qtrx->qt_icount_delta);
+ ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta);
#endif
if (totalbdelta)
- be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+ dqp->q_blk.count += totalbdelta;
if (qtrx->qt_icount_delta)
- be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+ dqp->q_ino.count += qtrx->qt_icount_delta;
if (totalrtbdelta)
- be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+ dqp->q_rtb.count += totalrtbdelta;
+
+ if (totalbdelta != 0 || totalrtbdelta != 0 ||
+ qtrx->qt_icount_delta != 0)
+ trace_xfs_trans_apply_dquot_deltas_after(dqp);
/*
* Get any default limits in use.
* Start/reset the timer(s) if needed.
*/
- if (d->d_id) {
- xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
- xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+ if (dqp->q_id) {
+ xfs_qm_adjust_dqlimits(dqp);
+ xfs_qm_adjust_dqtimers(dqp);
}
- dqp->dq_flags |= XFS_DQ_DIRTY;
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
/*
* add this to the list of items to get logged
*/
@@ -401,78 +436,31 @@ xfs_trans_apply_dquot_deltas(
* In case of delayed allocations, there's no
* reservation that a transaction structure knows of.
*/
- if (qtrx->qt_blk_res != 0) {
- uint64_t blk_res_used = 0;
-
- if (qtrx->qt_bcount_delta > 0)
- blk_res_used = qtrx->qt_bcount_delta;
-
- if (qtrx->qt_blk_res != blk_res_used) {
- if (qtrx->qt_blk_res > blk_res_used)
- dqp->q_res_bcount -= (xfs_qcnt_t)
- (qtrx->qt_blk_res -
- blk_res_used);
- else
- dqp->q_res_bcount -= (xfs_qcnt_t)
- (blk_res_used -
- qtrx->qt_blk_res);
- }
- } else {
- /*
- * These blks were never reserved, either inside
- * a transaction or outside one (in a delayed
- * allocation). Also, this isn't always a
- * negative number since we sometimes
- * deliberately skip quota reservations.
- */
- if (qtrx->qt_bcount_delta) {
- dqp->q_res_bcount +=
- (xfs_qcnt_t)qtrx->qt_bcount_delta;
- }
- }
+ blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta);
+ xfs_apply_quota_reservation_deltas(&dqp->q_blk,
+ qtrx->qt_blk_res, blk_res_used,
+ qtrx->qt_bcount_delta);
+
/*
* Adjust the RT reservation.
*/
- if (qtrx->qt_rtblk_res != 0) {
- if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
- if (qtrx->qt_rtblk_res >
- qtrx->qt_rtblk_res_used)
- dqp->q_res_rtbcount -= (xfs_qcnt_t)
- (qtrx->qt_rtblk_res -
- qtrx->qt_rtblk_res_used);
- else
- dqp->q_res_rtbcount -= (xfs_qcnt_t)
- (qtrx->qt_rtblk_res_used -
- qtrx->qt_rtblk_res);
- }
- } else {
- if (qtrx->qt_rtbcount_delta)
- dqp->q_res_rtbcount +=
- (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
- }
+ xfs_apply_quota_reservation_deltas(&dqp->q_rtb,
+ qtrx->qt_rtblk_res,
+ qtrx->qt_rtblk_res_used,
+ qtrx->qt_rtbcount_delta);
/*
* Adjust the inode reservation.
*/
- if (qtrx->qt_ino_res != 0) {
- ASSERT(qtrx->qt_ino_res >=
- qtrx->qt_ino_res_used);
- if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
- dqp->q_res_icount -= (xfs_qcnt_t)
- (qtrx->qt_ino_res -
- qtrx->qt_ino_res_used);
- } else {
- if (qtrx->qt_icount_delta)
- dqp->q_res_icount +=
- (xfs_qcnt_t)qtrx->qt_icount_delta;
- }
-
- ASSERT(dqp->q_res_bcount >=
- be64_to_cpu(dqp->q_core.d_bcount));
- ASSERT(dqp->q_res_icount >=
- be64_to_cpu(dqp->q_core.d_icount));
- ASSERT(dqp->q_res_rtbcount >=
- be64_to_cpu(dqp->q_core.d_rtbcount));
+ ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+ xfs_apply_quota_reservation_deltas(&dqp->q_ino,
+ qtrx->qt_ino_res,
+ qtrx->qt_ino_res_used,
+ qtrx->qt_icount_delta);
+
+ ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+ ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+ ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
}
}
}
@@ -493,7 +481,7 @@ xfs_trans_unreserve_and_mod_dquots(
struct xfs_dqtrx *qtrx, *qa;
bool locked;
- if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+ if (!tp->t_dqinfo)
return;
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
@@ -516,7 +504,7 @@ xfs_trans_unreserve_and_mod_dquots(
if (qtrx->qt_blk_res) {
xfs_dqlock(dqp);
locked = true;
- dqp->q_res_bcount -=
+ dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
if (qtrx->qt_ino_res) {
@@ -524,7 +512,7 @@ xfs_trans_unreserve_and_mod_dquots(
xfs_dqlock(dqp);
locked = true;
}
- dqp->q_res_icount -=
+ dqp->q_ino.reserved -=
(xfs_qcnt_t)qtrx->qt_ino_res;
}
@@ -533,7 +521,7 @@ xfs_trans_unreserve_and_mod_dquots(
xfs_dqlock(dqp);
locked = true;
}
- dqp->q_res_rtbcount -=
+ dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
if (locked)
@@ -549,21 +537,78 @@ xfs_quota_warn(
struct xfs_dquot *dqp,
int type)
{
- enum quota_type qtype;
+ enum quota_type qtype;
- if (dqp->dq_flags & XFS_DQ_PROJ)
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_PROJ:
qtype = PRJQUOTA;
- else if (dqp->dq_flags & XFS_DQ_USER)
+ break;
+ case XFS_DQTYPE_USER:
qtype = USRQUOTA;
- else
+ break;
+ case XFS_DQTYPE_GROUP:
qtype = GRPQUOTA;
+ break;
+ default:
+ return;
+ }
- quota_send_warning(make_kqid(&init_user_ns, qtype,
- be32_to_cpu(dqp->q_core.d_id)),
+ quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
mp->m_super->s_dev, type);
}
/*
+ * Decide if we can make an additional reservation against a quota resource.
+ * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal.
+ *
+ * Note that we assume that the numeric difference between the inode and block
+ * warning codes will always be 3 since it's userspace ABI now, and will never
+ * decrease the quota reservation, so the *BELOW messages are irrelevant.
+ */
+static inline int
+xfs_dqresv_check(
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ int64_t delta,
+ bool *fatal)
+{
+ xfs_qcnt_t hardlimit = res->hardlimit;
+ xfs_qcnt_t softlimit = res->softlimit;
+ xfs_qcnt_t total_count = res->reserved + delta;
+
+ BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3);
+ BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3);
+ BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3);
+
+ *fatal = false;
+ if (delta <= 0)
+ return QUOTA_NL_NOWARN;
+
+ if (!hardlimit)
+ hardlimit = qlim->hard;
+ if (!softlimit)
+ softlimit = qlim->soft;
+
+ if (hardlimit && total_count > hardlimit) {
+ *fatal = true;
+ return QUOTA_NL_IHARDWARN;
+ }
+
+ if (softlimit && total_count > softlimit) {
+ time64_t now = ktime_get_real_seconds();
+
+ if (res->timer != 0 && now > res->timer) {
+ *fatal = true;
+ return QUOTA_NL_ISOFTLONGWARN;
+ }
+
+ return QUOTA_NL_ISOFTWARN;
+ }
+
+ return QUOTA_NL_NOWARN;
+}
+
+/*
* This reserves disk blocks and inodes against a dquot.
* Flags indicate if the dquot is to be locked here and also
* if the blk reservation is for RT or regular blocks.
@@ -578,110 +623,58 @@ xfs_trans_dqresv(
long ninos,
uint flags)
{
- xfs_qcnt_t hardlimit;
- xfs_qcnt_t softlimit;
- time64_t timer;
- xfs_qwarncnt_t warns;
- xfs_qwarncnt_t warnlimit;
- xfs_qcnt_t total_count;
- xfs_qcnt_t *resbcountp;
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_def_quota *defq;
-
+ struct xfs_dquot_res *blkres;
+ struct xfs_quota_limits *qlim;
xfs_dqlock(dqp);
- defq = xfs_get_defquota(dqp, q);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
if (flags & XFS_TRANS_DQ_RES_BLKS) {
- hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
- if (!hardlimit)
- hardlimit = defq->bhardlimit;
- softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
- if (!softlimit)
- softlimit = defq->bsoftlimit;
- timer = be32_to_cpu(dqp->q_core.d_btimer);
- warns = be16_to_cpu(dqp->q_core.d_bwarns);
- warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
- resbcountp = &dqp->q_res_bcount;
+ blkres = &dqp->q_blk;
+ qlim = &defq->blk;
} else {
- ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
- hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
- if (!hardlimit)
- hardlimit = defq->rtbhardlimit;
- softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
- if (!softlimit)
- softlimit = defq->rtbsoftlimit;
- timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
- warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
- warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
- resbcountp = &dqp->q_res_rtbcount;
+ blkres = &dqp->q_rtb;
+ qlim = &defq->rtb;
}
- if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
- dqp->q_core.d_id &&
- ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
- (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
- (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
- if (nblks > 0) {
+ if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
+ xfs_dquot_is_enforced(dqp)) {
+ int quota_nl;
+ bool fatal;
+
+ /*
+ * dquot is locked already. See if we'd go over the hardlimit
+ * or exceed the timelimit if we'd reserve resources.
+ */
+ quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal);
+ if (quota_nl != QUOTA_NL_NOWARN) {
/*
- * dquot is locked already. See if we'd go over the
- * hardlimit or exceed the timelimit if we allocate
- * nblks.
+ * Quota block warning codes are 3 more than the inode
+ * codes, which we check above.
*/
- total_count = *resbcountp + nblks;
- if (hardlimit && total_count > hardlimit) {
- xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+ xfs_quota_warn(mp, dqp, quota_nl + 3);
+ if (fatal)
goto error_return;
- }
- if (softlimit && total_count > softlimit) {
- if ((timer != 0 &&
- ktime_get_real_seconds() > timer) ||
- (warns != 0 && warns >= warnlimit)) {
- xfs_quota_warn(mp, dqp,
- QUOTA_NL_BSOFTLONGWARN);
- goto error_return;
- }
-
- xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
- }
}
- if (ninos > 0) {
- total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
- timer = be32_to_cpu(dqp->q_core.d_itimer);
- warns = be16_to_cpu(dqp->q_core.d_iwarns);
- warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
- hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
- if (!hardlimit)
- hardlimit = defq->ihardlimit;
- softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
- if (!softlimit)
- softlimit = defq->isoftlimit;
-
- if (hardlimit && total_count > hardlimit) {
- xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+
+ quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos,
+ &fatal);
+ if (quota_nl != QUOTA_NL_NOWARN) {
+ xfs_quota_warn(mp, dqp, quota_nl);
+ if (fatal)
goto error_return;
- }
- if (softlimit && total_count > softlimit) {
- if ((timer != 0 &&
- ktime_get_real_seconds() > timer) ||
- (warns != 0 && warns >= warnlimit)) {
- xfs_quota_warn(mp, dqp,
- QUOTA_NL_ISOFTLONGWARN);
- goto error_return;
- }
- xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
- }
}
}
/*
* Change the reservation, but not the actual usage.
- * Note that q_res_bcount = q_core.d_bcount + resv
+ * Note that q_blk.reserved = q_blk.count + resv
*/
- (*resbcountp) += (xfs_qcnt_t)nblks;
- if (ninos != 0)
- dqp->q_res_icount += (xfs_qcnt_t)ninos;
+ blkres->reserved += (xfs_qcnt_t)nblks;
+ dqp->q_ino.reserved += (xfs_qcnt_t)ninos;
/*
* note the reservation amt in the trans struct too,
@@ -691,29 +684,29 @@ xfs_trans_dqresv(
* because we don't have the luxury of a transaction envelope then.
*/
if (tp) {
- ASSERT(tp->t_dqinfo);
ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
- if (nblks != 0)
- xfs_trans_mod_dquot(tp, dqp,
- flags & XFS_QMOPT_RESBLK_MASK,
- nblks);
- if (ninos != 0)
- xfs_trans_mod_dquot(tp, dqp,
- XFS_TRANS_DQ_RES_INOS,
- ninos);
+ xfs_trans_mod_dquot(tp, dqp, flags & XFS_QMOPT_RESBLK_MASK,
+ nblks);
+ xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_RES_INOS, ninos);
}
- ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
- ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
- ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+
+ if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count))
+ goto error_corrupt;
xfs_dqunlock(dqp);
return 0;
error_return:
xfs_dqunlock(dqp);
- if (flags & XFS_QMOPT_ENOSPC)
+ if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
return -ENOSPC;
return -EDQUOT;
+error_corrupt:
+ xfs_dqunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
}
@@ -742,17 +735,13 @@ xfs_trans_reserve_quota_bydquots(
{
int error;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
- if (tp && tp->t_dqinfo == NULL)
- xfs_trans_alloc_dqinfo(tp);
-
ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
if (udqp) {
- error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
- (flags & ~XFS_QMOPT_ENOSPC));
+ error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
if (error)
return error;
}
@@ -795,77 +784,68 @@ int
xfs_trans_reserve_quota_nblks(
struct xfs_trans *tp,
struct xfs_inode *ip,
- int64_t nblks,
- long ninos,
- uint flags)
+ int64_t dblocks,
+ int64_t rblocks,
+ bool force)
{
struct xfs_mount *mp = ip->i_mount;
+ unsigned int qflags = 0;
+ int error;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ if (!XFS_IS_QUOTA_ON(mp))
return 0;
- if (XFS_IS_PQUOTA_ON(mp))
- flags |= XFS_QMOPT_ENOSPC;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
-
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
- XFS_TRANS_DQ_RES_RTBLKS ||
- (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
- XFS_TRANS_DQ_RES_BLKS);
- /*
- * Reserve nblks against these dquots, with trans as the mediator.
- */
- return xfs_trans_reserve_quota_bydquots(tp, mp,
- ip->i_udquot, ip->i_gdquot,
- ip->i_pdquot,
- nblks, ninos, flags);
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /* Reserve data device quota against the inode's dquots. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS | qflags);
+ if (error)
+ return error;
+
+ /* Do the same but for realtime blocks. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, rblocks, 0,
+ XFS_QMOPT_RES_RTBLKS | qflags);
+ if (error) {
+ xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, -dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ return error;
+ }
+
+ return 0;
}
-/*
- * This routine is called to allocate a quotaoff log item.
- */
-struct xfs_qoff_logitem *
-xfs_trans_get_qoff_item(
+/* Change the quota reservations for an inode creation activity. */
+int
+xfs_trans_reserve_quota_icreate(
struct xfs_trans *tp,
- struct xfs_qoff_logitem *startqoff,
- uint flags)
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ int64_t dblocks)
{
- struct xfs_qoff_logitem *q;
-
- ASSERT(tp != NULL);
-
- q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
- ASSERT(q != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &q->qql_item);
- return q;
-}
+ struct xfs_mount *mp = tp->t_mountp;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return 0;
-/*
- * This is called to mark the quotaoff logitem as needing
- * to be logged when the transaction is committed. The logitem must
- * already be associated with the given transaction.
- */
-void
-xfs_trans_log_quotaoff_item(
- struct xfs_trans *tp,
- struct xfs_qoff_logitem *qlp)
-{
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
+ return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
+ dblocks, 1, XFS_QMOPT_RES_REGBLKS);
}
STATIC void
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
+ tp->t_dqinfo = kmem_cache_zalloc(xfs_dqtrx_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
}
void
@@ -874,6 +854,6 @@ xfs_trans_free_dqinfo(
{
if (!tp->t_dqinfo)
return;
- kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+ kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1c4614..d5400150358e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -6,6 +6,7 @@
#ifndef __XFS_TRANS_PRIV_H__
#define __XFS_TRANS_PRIV_H__
+struct xlog;
struct xfs_log_item;
struct xfs_mount;
struct xfs_trans;
@@ -18,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
-void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+void xfs_trans_committed_bulk(struct xfs_ail *ailp,
+ struct list_head *lv_chain,
xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
@@ -50,7 +52,7 @@ struct xfs_ail_cursor {
* Eventually we need to drive the locking in here as well.
*/
struct xfs_ail {
- struct xfs_mount *ail_mount;
+ struct xlog *ail_log;
struct task_struct *ail_task;
struct list_head ail_head;
xfs_lsn_t ail_target;
@@ -91,24 +93,13 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
-void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock);
+void xfs_trans_ail_insert(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ xfs_lsn_t lsn);
-static inline void
-xfs_trans_ail_remove(
- struct xfs_log_item *lip,
- int shutdown_type)
-{
- struct xfs_ail *ailp = lip->li_ailp;
-
- spin_lock(&ailp->ail_lock);
- /* xfs_trans_ail_delete() drops the AIL lock */
- if (test_bit(XFS_LI_IN_AIL, &lip->li_flags))
- xfs_trans_ail_delete(ailp, lip, shutdown_type);
- else
- spin_unlock(&ailp->ail_lock);
-}
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
+void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type);
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index b0fedb543f97..c325a28b89a8 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -9,82 +9,148 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_da_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_acl.h"
+#include "xfs_log.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-static int
-xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
- struct inode *inode, const char *name, void *value, size_t size)
+/*
+ * Get permission to use log-assisted atomic exchange of file extents.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+static inline int
+xfs_attr_grab_log_assist(
+ struct xfs_mount *mp)
{
- int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(inode);
- int error, asize = size;
- size_t namelen = strlen(name);
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (!size) {
- xflags |= ATTR_KERNOVAL;
- value = NULL;
- }
+ int error = 0;
+
+ /*
+ * Protect ourselves from an idle log clearing the logged xattrs log
+ * incompat feature bit.
+ */
+ xlog_use_incompat_feat(mp->m_log);
- error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value,
- &asize, xflags);
+ /*
+ * If log-assisted xattrs are already enabled, the caller can use the
+ * log assisted swap functions with the log-incompat reference we got.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return 0;
+
+ /* Enable log-assisted xattrs. */
+ error = xfs_add_incompat_log_feature(mp,
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
if (error)
- return error;
- return asize;
+ goto drop_incompat;
+
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
+ "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
+
+ return 0;
+drop_incompat:
+ xlog_drop_incompat_feat(mp->m_log);
+ return error;
}
-void
-xfs_forget_acl(
- struct inode *inode,
- const char *name,
- int xflags)
+static inline void
+xfs_attr_rele_log_assist(
+ struct xfs_mount *mp)
{
- /*
- * Invalidate any cached ACLs if the user has bypassed the ACL
- * interface. We don't validate the content whatsoever so it is caller
- * responsibility to provide data in valid format and ensure i_mode is
- * consistent.
- */
- if (xflags & ATTR_ROOT) {
-#ifdef CONFIG_XFS_POSIX_ACL
- if (!strcmp(name, SGI_ACL_FILE))
- forget_cached_acl(inode, ACL_TYPE_ACCESS);
- else if (!strcmp(name, SGI_ACL_DEFAULT))
- forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+ xlog_drop_incompat_feat(mp->m_log);
+}
+
+static inline bool
+xfs_attr_want_log_assist(
+ struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ /* Logged xattrs require a V5 super for log_incompat */
+ return xfs_has_crc(mp) && xfs_globals.larp;
+#else
+ return false;
#endif
+}
+
+/*
+ * Set or remove an xattr, having grabbed the appropriate logging resources
+ * prior to calling libxfs.
+ */
+int
+xfs_attr_change(
+ struct xfs_da_args *args)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ bool use_logging = false;
+ int error;
+
+ ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+
+ if (xfs_attr_want_log_assist(mp)) {
+ error = xfs_attr_grab_log_assist(mp);
+ if (error)
+ return error;
+
+ args->op_flags |= XFS_DA_OP_LOGGED;
+ use_logging = true;
}
+
+ error = xfs_attr_set(args);
+
+ if (use_logging)
+ xfs_attr_rele_log_assist(mp);
+ return error;
+}
+
+
+static int
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
+ struct inode *inode, const char *name, void *value, size_t size)
+{
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = value,
+ .valuelen = size,
+ };
+ int error;
+
+ error = xfs_attr_get(&args);
+ if (error)
+ return error;
+ return args.valuelen;
}
static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
- struct inode *inode, const char *name, const void *value,
- size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns, struct dentry *unused,
+ struct inode *inode, const char *name, const void *value,
+ size_t size, int flags)
{
- int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .attr_flags = flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = (void *)value,
+ .valuelen = size,
+ };
int error;
- size_t namelen = strlen(name);
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (flags & XATTR_CREATE)
- xflags |= ATTR_CREATE;
- if (flags & XATTR_REPLACE)
- xflags |= ATTR_REPLACE;
-
- if (value)
- error = xfs_attr_set(ip, name, namelen, (void *)value, size,
- xflags);
- else
- error = xfs_attr_remove(ip, name, namelen, xflags);
- if (!error)
- xfs_forget_acl(inode, name, xflags);
+ error = xfs_attr_change(&args);
+ if (!error && (handler->flags & XFS_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
return error;
}
@@ -97,14 +163,14 @@ static const struct xattr_handler xfs_xattr_user_handler = {
static const struct xattr_handler xfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .flags = ATTR_ROOT,
+ .flags = XFS_ATTR_ROOT,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
static const struct xattr_handler xfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .flags = ATTR_SECURE,
+ .flags = XFS_ATTR_SECURE,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
@@ -134,7 +200,7 @@ __xfs_xattr_put_listent(
if (context->count < 0 || context->seen_enough)
return;
- if (!context->alist)
+ if (!context->buffer)
goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
@@ -143,7 +209,7 @@ __xfs_xattr_put_listent(
context->seen_enough = 1;
return;
}
- offset = (char *)context->alist + context->count;
+ offset = context->buffer + context->count;
strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
@@ -218,7 +284,6 @@ xfs_vn_listxattr(
size_t size)
{
struct xfs_attr_list_context context;
- struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
int error;
@@ -227,14 +292,13 @@ xfs_vn_listxattr(
*/
memset(&context, 0, sizeof(context));
context.dp = XFS_I(inode);
- context.cursor = &cursor;
context.resynch = 1;
- context.alist = size ? data : NULL;
+ context.buffer = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
context.put_listent = xfs_xattr_put_listent;
- error = xfs_attr_list_int(&context);
+ error = xfs_attr_list(&context);
if (error)
return error;
if (context.count < 0)
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
new file mode 100644
index 000000000000..2b09133b1b9b
--- /dev/null
+++ b/fs/xfs/xfs_xattr.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_XATTR_H__
+#define __XFS_XATTR_H__
+
+int xfs_attr_change(struct xfs_da_args *args);
+
+extern const struct xattr_handler *xfs_xattr_handlers[];
+
+#endif /* __XFS_XATTR_H__ */