aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig17
-rw-r--r--fs/xfs/Makefile29
-rw-r--r--fs/xfs/kmem.h5
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c13
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c58
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2093
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h67
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c250
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h22
-rw-r--r--fs/xfs/libxfs/xfs_btree.c259
-rw-r--r--fs/xfs/libxfs/xfs_btree.h32
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h1
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c22
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c24
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h17
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h106
-rw-r--r--fs/xfs/libxfs/xfs_format.h37
-rw-r--r--fs/xfs/libxfs/xfs_fs.h77
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c95
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h7
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c1043
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c1
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c1333
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h138
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h51
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c1
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c1
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c13
-rw-r--r--fs/xfs/libxfs/xfs_types.h22
-rw-r--r--fs/xfs/scrub/agheader.c658
-rw-r--r--fs/xfs/scrub/alloc.c102
-rw-r--r--fs/xfs/scrub/attr.c471
-rw-r--r--fs/xfs/scrub/bmap.c363
-rw-r--r--fs/xfs/scrub/btree.c516
-rw-r--r--fs/xfs/scrub/btree.h57
-rw-r--r--fs/xfs/scrub/common.c574
-rw-r--r--fs/xfs/scrub/common.h144
-rw-r--r--fs/xfs/scrub/dabtree.c591
-rw-r--r--fs/xfs/scrub/dabtree.h59
-rw-r--r--fs/xfs/scrub/dir.c816
-rw-r--r--fs/xfs/scrub/ialloc.c337
-rw-r--r--fs/xfs/scrub/inode.c611
-rw-r--r--fs/xfs/scrub/parent.c317
-rw-r--r--fs/xfs/scrub/quota.c304
-rw-r--r--fs/xfs/scrub/refcount.c99
-rw-r--r--fs/xfs/scrub/rmap.c138
-rw-r--r--fs/xfs/scrub/rtbitmap.c108
-rw-r--r--fs/xfs/scrub/scrub.c392
-rw-r--r--fs/xfs/scrub/scrub.h115
-rw-r--r--fs/xfs/scrub/symlink.c92
-rw-r--r--fs/xfs/scrub/trace.c59
-rw-r--r--fs/xfs/scrub/trace.h499
-rw-r--r--fs/xfs/scrub/xfs_scrub.h29
-rw-r--r--fs/xfs/xfs.h1
-rw-r--r--fs/xfs/xfs_acl.c22
-rw-r--r--fs/xfs/xfs_aops.c50
-rw-r--r--fs/xfs/xfs_attr.h5
-rw-r--r--fs/xfs/xfs_attr_inactive.c71
-rw-r--r--fs/xfs/xfs_attr_list.c161
-rw-r--r--fs/xfs/xfs_bmap_util.c774
-rw-r--r--fs/xfs/xfs_bmap_util.h23
-rw-r--r--fs/xfs/xfs_buf.c18
-rw-r--r--fs/xfs/xfs_buf.h5
-rw-r--r--fs/xfs/xfs_dir2_readdir.c10
-rw-r--r--fs/xfs/xfs_discard.h1
-rw-r--r--fs/xfs/xfs_dquot.c21
-rw-r--r--fs/xfs/xfs_error.c8
-rw-r--r--fs/xfs/xfs_error.h81
-rw-r--r--fs/xfs/xfs_file.c110
-rw-r--r--fs/xfs/xfs_fsmap.c58
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_inode.c42
-rw-r--r--fs/xfs/xfs_inode.h4
-rw-r--r--fs/xfs/xfs_inode_item.c104
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_ioctl.c161
-rw-r--r--fs/xfs/xfs_ioctl.h4
-rw-r--r--fs/xfs/xfs_ioctl32.c1
-rw-r--r--fs/xfs/xfs_iomap.c33
-rw-r--r--fs/xfs/xfs_iomap.h2
-rw-r--r--fs/xfs/xfs_iops.c52
-rw-r--r--fs/xfs/xfs_itable.c13
-rw-r--r--fs/xfs/xfs_itable.h2
-rw-r--r--fs/xfs/xfs_linux.h21
-rw-r--r--fs/xfs/xfs_log.c35
-rw-r--r--fs/xfs/xfs_log_priv.h4
-rw-r--r--fs/xfs/xfs_log_recover.c62
-rw-r--r--fs/xfs/xfs_message.h1
-rw-r--r--fs/xfs/xfs_mount.c17
-rw-r--r--fs/xfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/xfs_pnfs.c3
-rw-r--r--fs/xfs/xfs_pnfs.h1
-rw-r--r--fs/xfs/xfs_reflink.c103
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_super.c12
-rw-r--r--fs/xfs/xfs_trace.h66
-rw-r--r--fs/xfs/xfs_trans_ail.c22
99 files changed, 11290 insertions, 4197 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 1b98cfa342ab..f42fcf1b5465 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -71,6 +71,23 @@ config XFS_RT
If unsure, say N.
+config XFS_ONLINE_SCRUB
+ bool "XFS online metadata check support"
+ default n
+ depends on XFS_FS
+ help
+ If you say Y here you will be able to check metadata on a
+ mounted XFS filesystem. This feature is intended to reduce
+ filesystem downtime by supplementing xfs_repair. The key
+ advantage here is to look for problems proactively so that
+ they can be dealt with in a controlled manner.
+
+ This feature is considered EXPERIMENTAL. Use with caution!
+
+ See the xfs_scrub man page in section 8 for additional information.
+
+ If unsure, say N.
+
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a6e955bfead8..7ceb41a9786a 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_dquot_buf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
+ xfs_iext_tree.o \
xfs_inode_fork.o \
xfs_inode_buf.o \
xfs_log_rlimit.o \
@@ -135,3 +136,31 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+
+# online scrub/repair
+ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
+
+# Tracepoints like to blow up, so build that before everything else
+
+xfs-y += $(addprefix scrub/, \
+ trace.o \
+ agheader.o \
+ alloc.o \
+ attr.o \
+ bmap.o \
+ btree.o \
+ common.o \
+ dabtree.o \
+ dir.o \
+ ialloc.o \
+ inode.o \
+ parent.o \
+ refcount.o \
+ rmap.o \
+ scrub.o \
+ symlink.o \
+ )
+
+xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
+xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+endif
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 4d85992d75b2..4b87472f35bc 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name)
}
static inline kmem_zone_t *
-kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags,
void (*construct)(void *))
{
return kmem_cache_create(zone_name, size, 0, flags, construct);
@@ -119,8 +119,7 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr)
static inline void
kmem_zone_destroy(kmem_zone_t *zone)
{
- if (zone)
- kmem_cache_destroy(zone);
+ kmem_cache_destroy(zone);
}
extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index b008ff3250eb..2291f4224e24 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -27,6 +27,7 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_alloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
@@ -156,7 +157,8 @@ __xfs_ag_resv_free(
trace_xfs_ag_resv_free(pag, type, 0);
resv = xfs_perag_resv(pag, type);
- pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ if (pag->pag_agno == 0)
+ pag->pag_mount->m_ag_max_usable += resv->ar_asked;
/*
* AGFL blocks are always considered "free", so whatever
* was reserved at mount time must be given back at umount.
@@ -216,7 +218,14 @@ __xfs_ag_resv_init(
return error;
}
- mp->m_ag_max_usable -= ask;
+ /*
+ * Reduce the maximum per-AG allocation length by however much we're
+ * trying to reserve for an AG. Since this is a filesystem-wide
+ * counter, we only make the adjustment for AG 0. This assumes that
+ * there aren't any AGs hungrier for per-AG reservation than AG 0.
+ */
+ if (pag->pag_agno == 0)
+ mp->m_ag_max_usable -= ask;
resv = xfs_perag_resv(pag, type);
resv->ar_asked = ask;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 744dcaec34cc..0da80019a917 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -31,6 +31,7 @@
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
@@ -1584,6 +1585,10 @@ xfs_alloc_ag_vextent_small(
bp = xfs_btree_get_bufs(args->mp, args->tp,
args->agno, fbno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
xfs_trans_binval(args->tp, bp);
}
args->len = 1;
@@ -2141,6 +2146,10 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto out_agbp_relse;
+ }
xfs_trans_binval(tp, bp);
}
@@ -2923,3 +2932,52 @@ xfs_alloc_query_all(
query.fn = fn;
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
+
+/* Find the size of the AG, in blocks. */
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ ASSERT(agno < mp->m_sb.sb_agcount);
+
+ if (agno < mp->m_sb.sb_agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Verify that an AG block number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+bool
+xfs_verify_agbno(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno)
+{
+ xfs_agblock_t eoag;
+
+ eoag = xfs_ag_block_count(mp, agno);
+ if (agbno >= eoag)
+ return false;
+ if (agbno <= XFS_AGFL_BLOCK(mp))
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an FS block number pointer neither points outside the
+ * filesystem nor points at static AG metadata.
+ */
+bool
+xfs_verify_fsbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno)
+{
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return false;
+ return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ef26edc2e938..7ba2d129d504 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -232,5 +232,9 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur,
xfs_alloc_query_range_fn fn, void *priv);
int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
void *priv);
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno);
+bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5c16db86b38f..53cc8b986eac 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -397,13 +397,9 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
/* rounded down */
offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
- switch (dp->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
+ if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) {
minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
return (offset >= minforkoff) ? minforkoff : 0;
- case XFS_DINODE_FMT_UUID:
- minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
- return (offset >= minforkoff) ? minforkoff : 0;
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 459f4b4f08fe..08df809e2315 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -38,6 +38,7 @@
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
@@ -49,7 +50,6 @@
#include "xfs_rmap.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
-#include "xfs_rmap_btree.h"
#include "xfs_icache.h"
@@ -113,28 +113,21 @@ xfs_bmap_compute_maxlevels(
STATIC int /* error */
xfs_bmbt_lookup_eq(
struct xfs_btree_cur *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
+ struct xfs_bmbt_irec *irec,
int *stat) /* success/failure */
{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
+ cur->bc_rec.b = *irec;
return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
}
STATIC int /* error */
-xfs_bmbt_lookup_ge(
+xfs_bmbt_lookup_first(
struct xfs_btree_cur *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
int *stat) /* success/failure */
{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
+ cur->bc_rec.b.br_startoff = 0;
+ cur->bc_rec.b.br_startblock = 0;
+ cur->bc_rec.b.br_blockcount = 0;
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
@@ -161,21 +154,17 @@ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
}
/*
- * Update the record referred to by cur to the value given
- * by [off, bno, len, state].
+ * Update the record referred to by cur to the value given by irec
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
STATIC int
xfs_bmbt_update(
struct xfs_btree_cur *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- xfs_exntst_t state)
+ struct xfs_bmbt_irec *irec)
{
union xfs_btree_rec rec;
- xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+ xfs_bmbt_disk_set_all(&rec.bmbt, irec);
return xfs_btree_update(cur, &rec);
}
@@ -192,12 +181,8 @@ xfs_bmap_worst_indlen(
int maxrecs; /* maximum record count at this level */
xfs_mount_t *mp; /* mount structure */
xfs_filblks_t rval; /* return value */
- xfs_filblks_t orig_len;
mp = ip->i_mount;
-
- /* Calculate the worst-case size of the bmbt. */
- orig_len = len;
maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
@@ -205,20 +190,12 @@ xfs_bmap_worst_indlen(
len += maxrecs - 1;
do_div(len, maxrecs);
rval += len;
- if (len == 1) {
- rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
level - 1;
- break;
- }
if (level == 0)
maxrecs = mp->m_bmap_dmxr[1];
}
-
- /* Calculate the worst-case size of the rmapbt. */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) +
- mp->m_rmap_maxlevels;
-
return rval;
}
@@ -255,7 +232,6 @@ xfs_bmap_forkoff_reset(
{
if (whichfork == XFS_ATTR_FORK &&
ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
- ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
@@ -512,31 +488,6 @@ error_norelse:
}
/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork, /* data or attr or cow fork */
- unsigned long caller_ip)
-{
- xfs_extnum_t idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int state = 0;
-
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
- else if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == xfs_iext_count(ifp));
- for (idx = 0; idx < cnt; idx++)
- trace_xfs_extlist(ip, idx, state, caller_ip);
-}
-
-/*
* Validate that the bmbt_irecs being returned from bmapi are valid
* given the caller's original parameters. Specifically check the
* ranges of the returned irecs to ensure that they only extend beyond
@@ -670,8 +621,8 @@ xfs_bmap_btree_to_extents(
cbno = be64_to_cpu(*pp);
*logflagsp = 0;
#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
- return error;
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+ xfs_btree_check_lptr(cur, cbno, 1));
#endif
error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
@@ -716,14 +667,14 @@ xfs_bmap_extents_to_btree(
xfs_bmbt_rec_t *arp; /* child record pointer */
struct xfs_btree_block *block; /* btree root block */
xfs_btree_cur_t *cur; /* bmap btree cursor */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
int error; /* error return value */
- xfs_extnum_t i, cnt; /* extent record index */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_key_t *kp; /* root block key pointer */
xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* number of file extents */
xfs_bmbt_ptr_t *pp; /* root block address pointer */
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ xfs_extnum_t cnt = 0;
mp = ip->i_mount;
ASSERT(whichfork != XFS_COW_FORK);
@@ -802,15 +753,12 @@ xfs_bmap_extents_to_btree(
XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
XFS_BTREE_LONG_PTRS);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- nextents = xfs_iext_count(ifp);
- for (cnt = i = 0; i < nextents; i++) {
- ep = xfs_iext_get_ext(ifp, i);
- if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
- arp->l0 = cpu_to_be64(ep->l0);
- arp->l1 = cpu_to_be64(ep->l1);
- arp++; cnt++;
- }
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
+ continue;
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+ xfs_bmbt_disk_set_all(arp, &rec);
+ cnt++;
}
ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
xfs_btree_set_numrecs(ablock, cnt);
@@ -858,6 +806,8 @@ xfs_bmap_local_to_extents_empty(
xfs_bmap_forkoff_reset(ip, whichfork);
ifp->if_flags &= ~XFS_IFINLINE;
ifp->if_flags |= XFS_IFEXTENTS;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
}
@@ -881,6 +831,7 @@ xfs_bmap_local_to_extents(
xfs_alloc_arg_t args; /* allocation arguments */
xfs_buf_t *bp; /* buffer for extent block */
struct xfs_bmbt_irec rec;
+ struct xfs_iext_cursor icur;
/*
* We don't want to deal with the case of keeping inode data inline yet.
@@ -898,8 +849,7 @@ xfs_bmap_local_to_extents(
flags = 0;
error = 0;
- ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
- XFS_IFINLINE);
+ ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS)) == XFS_IFINLINE);
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
@@ -943,15 +893,16 @@ xfs_bmap_local_to_extents(
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
+
rec.br_startoff = 0;
rec.br_startblock = args.fsbno;
rec.br_blockcount = 1;
rec.br_state = XFS_EXT_NORM;
- xfs_iext_insert(ip, 0, 1, &rec, 0);
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &rec, 0);
- trace_xfs_bmap_post_update(ip, 0,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
- _THIS_IP_);
XFS_IFORK_NEXT_SET(ip, whichfork, 1);
ip->i_d.di_nblocks = 1;
xfs_trans_mod_dquot_byino(tp, ip,
@@ -986,7 +937,8 @@ xfs_bmap_add_attrfork_btree(
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
cur->bc_private.b.dfops = dfops;
cur->bc_private.b.firstblock = *firstblock;
- if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+ error = xfs_bmbt_lookup_first(cur, &stat);
+ if (error)
goto error0;
/* must be at least one entry */
XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
@@ -1137,9 +1089,6 @@ xfs_bmap_add_attrfork(
case XFS_DINODE_FMT_DEV:
ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
break;
- case XFS_DINODE_FMT_UUID:
- ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
- break;
case XFS_DINODE_FMT_LOCAL:
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -1219,32 +1168,35 @@ trans_cancel:
*/
/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
+ * Read in extents from a btree-format inode.
*/
-int /* error */
-xfs_bmap_read_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
+int
+xfs_iread_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
{
- struct xfs_btree_block *block; /* current btree block */
- xfs_fsblock_t bno; /* block # of "block" */
- xfs_buf_t *bp; /* buffer for "block" */
- int error; /* error return value */
- xfs_extnum_t i, j; /* index into the extents list */
- xfs_ifork_t *ifp; /* fork structure */
- int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- /* REFERENCED */
- xfs_extnum_t room; /* number of entries there's room for */
+ struct xfs_mount *mp = ip->i_mount;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+ struct xfs_btree_block *block = ifp->if_broot;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec new;
+ xfs_fsblock_t bno;
+ struct xfs_buf *bp;
+ xfs_extnum_t i, j;
+ int level;
+ __be64 *pp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
@@ -1261,21 +1213,23 @@ xfs_bmap_read_extents(
error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
- return error;
+ goto out;
block = XFS_BUF_TO_BLOCK(bp);
if (level == 0)
break;
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
XFS_WANT_CORRUPTED_GOTO(mp,
- XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ XFS_FSB_SANITY_CHECK(mp, bno), out_brelse);
xfs_trans_brelse(tp, bp);
}
+
/*
* Here with bp and block set to the leftmost leaf node in the tree.
*/
- room = xfs_iext_count(ifp);
i = 0;
+ xfs_iext_first(ifp, &icur);
+
/*
* Loop over all leaf nodes. Copy information to the extent records.
*/
@@ -1285,14 +1239,15 @@ xfs_bmap_read_extents(
xfs_extnum_t num_recs;
num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(i + num_recs > room)) {
- ASSERT(i + num_recs <= room);
+ if (unlikely(i + num_recs > nextents)) {
+ ASSERT(i + num_recs <= nextents);
xfs_warn(ip->i_mount,
"corrupt dinode %Lu, (btree extents).",
(unsigned long long) ip->i_ino);
- XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+ XFS_CORRUPTION_ERROR(__func__,
XFS_ERRLEVEL_LOW, ip->i_mount, block);
- goto error0;
+ error = -EFSCORRUPTED;
+ goto out_brelse;
}
/*
* Read-ahead the next leaf block, if any.
@@ -1305,15 +1260,17 @@ xfs_bmap_read_extents(
* Copy records into the extent records.
*/
frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- for (j = 0; j < num_recs; j++, i++, frp++) {
- xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
- trp->l0 = be64_to_cpu(frp->l0);
- trp->l1 = be64_to_cpu(frp->l1);
- if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) {
+ for (j = 0; j < num_recs; j++, frp++, i++) {
+ xfs_bmbt_disk_get_all(frp, &new);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) {
XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
XFS_ERRLEVEL_LOW, mp);
- goto error0;
+ error = -EFSCORRUPTED;
+ goto out_brelse;
}
+ xfs_iext_insert(ip, &icur, &new, state);
+ trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
+ xfs_iext_next(ifp, &icur);
}
xfs_trans_brelse(tp, bp);
bno = nextbno;
@@ -1325,71 +1282,74 @@ xfs_bmap_read_extents(
error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
- return error;
+ goto out;
block = XFS_BUF_TO_BLOCK(bp);
}
- if (i != XFS_IFORK_NEXTENTS(ip, whichfork))
- return -EFSCORRUPTED;
+
+ if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
ASSERT(i == xfs_iext_count(ifp));
- XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+
+ ifp->if_flags |= XFS_IFEXTENTS;
return 0;
-error0:
+
+out_brelse:
xfs_trans_brelse(tp, bp);
- return -EFSCORRUPTED;
+out:
+ xfs_iext_destroy(ifp);
+ return error;
}
/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
+ * Returns the relative block number of the first unused block(s) in the given
+ * fork with at least "len" logically contiguous blocks free. This is the
+ * lowest-address hole if the fork has holes, else the first block past the end
+ * of fork. Return 0 if the fork is currently local (in-inode).
*/
int /* error */
xfs_bmap_first_unused(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_extlen_t len, /* size of hole to find */
- xfs_fileoff_t *first_unused, /* unused block */
- int whichfork) /* data or attr fork */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
{
- int error; /* error return value */
- int idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_fileoff_t lastaddr; /* last block number seen */
- xfs_fileoff_t lowest; /* lowest useful block */
- xfs_fileoff_t max; /* starting useful block */
- xfs_extnum_t nextents; /* number of extent entries */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t lastaddr = 0;
+ xfs_fileoff_t lowest, max;
+ int error;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
*first_unused = 0;
return 0;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- lowest = *first_unused;
- nextents = xfs_iext_count(ifp);
- for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
- struct xfs_bmbt_irec got;
- xfs_iext_get_extent(ifp, idx, &got);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+ lowest = max = *first_unused;
+ for_each_xfs_iext(ifp, &icur, &got) {
/*
* See if the hole before this extent will work.
*/
if (got.br_startoff >= lowest + len &&
- got.br_startoff - max >= len) {
- *first_unused = max;
- return 0;
- }
+ got.br_startoff - max >= len)
+ break;
lastaddr = got.br_startoff + got.br_blockcount;
max = XFS_FILEOFF_MAX(lastaddr, lowest);
}
+
*first_unused = max;
return 0;
}
@@ -1409,7 +1369,7 @@ xfs_bmap_last_before(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int error;
switch (XFS_IFORK_FORMAT(ip, whichfork)) {
@@ -1429,17 +1389,8 @@ xfs_bmap_last_before(
return error;
}
- if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) {
- if (got.br_startoff <= *last_block - 1)
- return 0;
- }
-
- if (xfs_iext_get_extent(ifp, idx - 1, &got)) {
- *last_block = got.br_startoff + got.br_blockcount;
- return 0;
- }
-
- *last_block = 0;
+ if (!xfs_iext_lookup_extent_before(ip, ifp, last_block, &icur, &got))
+ *last_block = 0;
return 0;
}
@@ -1452,8 +1403,8 @@ xfs_bmap_last_extent(
int *is_empty)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_iext_cursor icur;
int error;
- int nextents;
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
@@ -1461,14 +1412,11 @@ xfs_bmap_last_extent(
return error;
}
- nextents = xfs_iext_count(ifp);
- if (nextents == 0) {
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, rec))
*is_empty = 1;
- return 0;
- }
-
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
- *is_empty = 0;
+ else
+ *is_empty = 0;
return 0;
}
@@ -1490,14 +1438,14 @@ xfs_bmap_isaeof(
int is_empty;
int error;
- bma->aeof = 0;
+ bma->aeof = false;
error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
&is_empty);
if (error)
return error;
if (is_empty) {
- bma->aeof = 1;
+ bma->aeof = true;
return 0;
}
@@ -1553,10 +1501,10 @@ xfs_bmap_one_block(
xfs_inode_t *ip, /* incore inode */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
xfs_ifork_t *ifp; /* inode fork pointer */
int rval; /* return value */
xfs_bmbt_irec_t s; /* internal version of extent */
+ struct xfs_iext_cursor icur;
#ifndef DEBUG
if (whichfork == XFS_DATA_FORK)
@@ -1568,8 +1516,8 @@ xfs_bmap_one_block(
return 0;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_get_all(ep, &s);
+ xfs_iext_first(ifp, &icur);
+ xfs_iext_get_extent(ifp, &icur, &s);
rval = s.br_startoff == 0 && s.br_blockcount == 1;
if (rval && whichfork == XFS_DATA_FORK)
ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
@@ -1589,8 +1537,6 @@ xfs_bmap_add_extent_delay_real(
int whichfork)
{
struct xfs_bmbt_irec *new = &bma->got;
- int diff; /* temp value */
- xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1598,14 +1544,14 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = 0;/* state bits, accessed thru macros */
+ int state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
- xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
struct xfs_mount *mp;
xfs_extnum_t *nextents;
+ struct xfs_bmbt_irec old;
mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
@@ -1613,8 +1559,6 @@ xfs_bmap_add_extent_delay_real(
nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
&bma->ip->i_d.di_nextents);
- ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -1625,15 +1569,12 @@ xfs_bmap_add_extent_delay_real(
#define RIGHT r[1]
#define PREV r[2]
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
/*
* Set up a bunch of variables to make the tests simpler.
*/
- ep = xfs_iext_get_ext(ifp, bma->idx);
- xfs_bmbt_get_all(ep, &PREV);
+ xfs_iext_get_extent(ifp, &bma->icur, &PREV);
new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(isnullstartblock(PREV.br_startblock));
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
@@ -1653,10 +1594,8 @@ xfs_bmap_add_extent_delay_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (bma->idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, &bma->icur, &LEFT)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT);
-
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -1673,10 +1612,8 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < xfs_iext_count(ifp) - 1) {
+ if (xfs_iext_peek_next_extent(ifp, &bma->icur, &RIGHT)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
-
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1706,22 +1643,19 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left and right neighbors are both contiguous with new.
*/
- bma->idx--;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
- LEFT.br_blockcount + PREV.br_blockcount +
- RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
- xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
+
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
(*nextents)--;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -1733,11 +1667,7 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount +
- PREV.br_blockcount +
- RIGHT.br_blockcount, LEFT.br_state);
+ error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
}
@@ -1748,28 +1678,22 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left neighbor is contiguous, the right is not.
*/
- bma->idx--;
+ old = LEFT;
+ LEFT.br_blockcount += PREV.br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
- LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock, LEFT.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount +
- PREV.br_blockcount, LEFT.br_state);
+ error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
}
@@ -1780,27 +1704,23 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, new->br_startblock);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount + RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_blockcount += RIGHT.br_blockcount;
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_remove(bma->ip, &bma->icur, state);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
- new->br_startblock,
- PREV.br_blockcount +
- RIGHT.br_blockcount, PREV.br_state);
+ error = xfs_bmbt_update(bma->cur, &PREV);
if (error)
goto done;
}
@@ -1812,23 +1732,19 @@ xfs_bmap_add_extent_delay_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, new->br_startblock);
- xfs_bmbt_set_state(ep, new->br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ PREV.br_startblock = new->br_startblock;
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
(*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -1841,40 +1757,33 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1),
- LEFT.br_blockcount + new->br_blockcount);
- xfs_bmbt_set_startoff(ep,
- PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_);
-
+ old = LEFT;
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
+ startblockval(PREV.br_startblock));
+
+ LEFT.br_blockcount += new->br_blockcount;
+
+ PREV.br_blockcount = temp;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_prev(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
+
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock, LEFT.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount +
- new->br_blockcount,
- LEFT.br_state);
+ error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
}
- da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
- startblockval(PREV.br_startblock));
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
-
- bma->idx--;
break;
case BMAP_LEFT_FILLING:
@@ -1882,23 +1791,16 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, new_endoff);
- temp = PREV.br_blockcount - new->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
(*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -1913,12 +1815,18 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+
+ temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
(bma->cur ? bma->cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, bma->idx + 1);
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+
+ PREV.br_startoff = new_endoff;
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_prev(ifp, &bma->icur);
break;
case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1926,40 +1834,34 @@ xfs_bmap_add_extent_delay_real(
* Filling in the last part of a previous delayed allocation.
* The right neighbor is contiguous with the new allocation.
*/
- temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1),
- new->br_startoff, new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount,
- RIGHT.br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_);
+ old = RIGHT;
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
+
if (bma->cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount +
- RIGHT.br_blockcount,
- RIGHT.br_state);
+ error = xfs_bmbt_update(bma->cur, &RIGHT);
if (error)
goto done;
}
+ temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock));
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- bma->idx++;
+ PREV.br_blockcount = temp;
+ PREV.br_startblock = nullstartblock(da_new);
+
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
break;
case BMAP_RIGHT_FILLING:
@@ -1967,22 +1869,16 @@ xfs_bmap_add_extent_delay_real(
* Filling in the last part of a previous delayed allocation.
* The right neighbor is not contiguous.
*/
- temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
(*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -1997,14 +1893,16 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
+
+ temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
(bma->cur ? bma->cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, bma->idx);
- xfs_bmbt_set_startblock(ep, nullstartblock(da_new));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- bma->idx++;
+ PREV.br_startblock = nullstartblock(da_new);
+ PREV.br_blockcount = temp;
+ xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
+ xfs_iext_next(ifp, &bma->icur);
break;
case 0:
@@ -2028,30 +1926,40 @@ xfs_bmap_add_extent_delay_real(
* PREV @ idx LEFT RIGHT
* inserted at idx + 1
*/
- temp = new->br_startoff - PREV.br_startoff;
- temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
+ old = PREV;
+
+ /* LEFT is the new middle */
LEFT = *new;
+
+ /* RIGHT is the new right */
RIGHT.br_state = PREV.br_state;
- RIGHT.br_startblock = nullstartblock(
- (int)xfs_bmap_worst_indlen(bma->ip, temp2));
RIGHT.br_startoff = new_endoff;
- RIGHT.br_blockcount = temp2;
- /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
- xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
+ RIGHT.br_blockcount =
+ PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ RIGHT.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ RIGHT.br_blockcount));
+
+ /* truncate PREV */
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
+ PREV.br_startblock =
+ nullstartblock(xfs_bmap_worst_indlen(bma->ip,
+ PREV.br_blockcount));
+ xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+
+ xfs_iext_next(ifp, &bma->icur);
+ xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
+ xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
(*nextents)++;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
@@ -2066,30 +1974,9 @@ xfs_bmap_add_extent_delay_real(
if (error)
goto done;
}
- temp = xfs_bmap_worst_indlen(bma->ip, temp);
- temp2 = xfs_bmap_worst_indlen(bma->ip, temp2);
- diff = (int)(temp + temp2 -
- (startblockval(PREV.br_startblock) -
- (bma->cur ?
- bma->cur->bc_private.b.allocated : 0)));
- if (diff > 0) {
- error = xfs_mod_fdblocks(bma->ip->i_mount,
- -((int64_t)diff), false);
- ASSERT(!error);
- if (error)
- goto done;
- }
- ep = xfs_iext_get_ext(ifp, bma->idx);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2),
- nullstartblock((int)temp2));
- trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_);
-
- bma->idx++;
- da_new = temp + temp2;
+ da_new = startblockval(PREV.br_startblock) +
+ startblockval(RIGHT.br_startblock);
break;
case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -2123,19 +2010,17 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
- /* adjust for changes in reserved delayed indirect blocks */
- if (da_old || da_new) {
- temp = da_new;
- if (bma->cur)
- temp += bma->cur->bc_private.b.allocated;
- if (temp < da_old)
- xfs_mod_fdblocks(bma->ip->i_mount,
- (int64_t)(da_old - temp), false);
+ if (bma->cur) {
+ da_new += bma->cur->bc_private.b.allocated;
+ bma->cur->bc_private.b.allocated = 0;
}
- /* clear out the allocated field, done with it now in any case. */
- if (bma->cur)
- bma->cur->bc_private.b.allocated = 0;
+ /* adjust for changes in reserved delayed indirect blocks */
+ if (da_new != da_old) {
+ ASSERT(state == 0 || da_new < da_old);
+ error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
+ false);
+ }
xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
@@ -2155,7 +2040,7 @@ xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
- xfs_extnum_t *idx, /* extent number to update/insert */
+ struct xfs_iext_cursor *icur,
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_fsblock_t *first, /* pointer to firstblock variable */
@@ -2163,28 +2048,22 @@ xfs_bmap_add_extent_unwritten_real(
int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
- xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_fileoff_t new_endoff; /* end offset of new entry */
- xfs_exntst_t newext; /* new extent state */
- xfs_exntst_t oldext; /* old extent state */
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = 0;/* state bits, accessed thru macros */
+ int state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec old;
*logflagsp = 0;
cur = *curp;
ifp = XFS_IFORK_PTR(ip, whichfork);
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2197,12 +2076,8 @@ xfs_bmap_add_extent_unwritten_real(
* Set up a bunch of variables to make the tests simpler.
*/
error = 0;
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_get_all(ep, &PREV);
- newext = new->br_state;
- oldext = (newext == XFS_EXT_UNWRITTEN) ?
- XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
- ASSERT(PREV.br_state == oldext);
+ xfs_iext_get_extent(ifp, icur, &PREV);
+ ASSERT(new->br_state != PREV.br_state);
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
@@ -2220,10 +2095,8 @@ xfs_bmap_add_extent_unwritten_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (*idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, icur, &LEFT)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
-
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2231,7 +2104,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
- LEFT.br_state == newext &&
+ LEFT.br_state == new->br_state &&
LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
state |= BMAP_LEFT_CONTIG;
@@ -2240,9 +2113,8 @@ xfs_bmap_add_extent_unwritten_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (*idx < xfs_iext_count(ifp) - 1) {
+ if (xfs_iext_peek_next_extent(ifp, icur, &RIGHT)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2250,7 +2122,7 @@ xfs_bmap_add_extent_unwritten_real(
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
- newext == RIGHT.br_state &&
+ new->br_state == RIGHT.br_state &&
new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
@@ -2271,24 +2143,20 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left and right neighbors are both contiguous with new.
*/
- --*idx;
+ LEFT.br_blockcount += PREV.br_blockcount + RIGHT.br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- LEFT.br_blockcount + PREV.br_blockcount +
- RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
- xfs_iext_remove(ip, *idx + 1, 2, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
@@ -2303,10 +2171,8 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount + PREV.br_blockcount +
- RIGHT.br_blockcount, LEFT.br_state)))
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
goto done;
}
break;
@@ -2316,23 +2182,19 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- --*idx;
-
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ LEFT.br_blockcount += PREV.br_blockcount;
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &PREV, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
@@ -2341,10 +2203,8 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount + PREV.br_blockcount,
- LEFT.br_state)))
+ error = xfs_bmbt_update(cur, &LEFT);
+ if (error)
goto done;
}
break;
@@ -2354,21 +2214,22 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount + RIGHT.br_blockcount);
- xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
+
+ xfs_iext_next(ifp, icur);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
- RIGHT.br_startblock,
- RIGHT.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
@@ -2377,10 +2238,8 @@ xfs_bmap_add_extent_unwritten_real(
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount,
- newext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
}
break;
@@ -2391,22 +2250,19 @@ xfs_bmap_add_extent_unwritten_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ PREV.br_state = new->br_state;
+ xfs_iext_update_extent(ip, state, icur, &PREV);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- newext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
}
break;
@@ -2416,43 +2272,32 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
- LEFT.br_blockcount + new->br_blockcount);
- xfs_bmbt_set_startoff(ep,
- PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
-
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startblock(ep,
- new->br_startblock + new->br_blockcount);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
- --*idx;
+ LEFT.br_blockcount += new->br_blockcount;
+
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &LEFT);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur,
- PREV.br_startoff + new->br_blockcount,
- PREV.br_startblock + new->br_blockcount,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
- if ((error = xfs_btree_decrement(cur, 0, &i)))
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
goto done;
- error = xfs_bmbt_update(cur, LEFT.br_startoff,
- LEFT.br_startblock,
- LEFT.br_blockcount + new->br_blockcount,
- LEFT.br_state);
+ error = xfs_bmbt_update(cur, &LEFT);
if (error)
goto done;
}
@@ -2463,32 +2308,25 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
- xfs_bmbt_set_startoff(ep, new_endoff);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- xfs_bmbt_set_startblock(ep,
- new->br_startblock + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-
- xfs_iext_insert(ip, *idx, 1, new, state);
+ old = PREV;
+ PREV.br_startoff += new->br_blockcount;
+ PREV.br_startblock += new->br_blockcount;
+ PREV.br_blockcount -= new->br_blockcount;
+
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_insert(ip, icur, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur,
- PREV.br_startoff + new->br_blockcount,
- PREV.br_startblock + new->br_blockcount,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
cur->bc_rec.b = *new;
if ((error = xfs_btree_insert(cur, &i)))
@@ -2502,39 +2340,33 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is contiguous with the new allocation.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
- ++*idx;
+ RIGHT.br_startoff = new->br_startoff;
+ RIGHT.br_startblock = new->br_startblock;
+ RIGHT.br_blockcount += new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
- new->br_startoff, new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount, newext);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &RIGHT);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
rval = 0;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock,
- PREV.br_blockcount, &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
- PREV.br_startblock,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
- if ((error = xfs_btree_increment(cur, 0, &i)))
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
goto done;
- if ((error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount + RIGHT.br_blockcount,
- newext)))
+ error = xfs_bmbt_update(cur, &RIGHT);
+ if (error)
goto done;
}
break;
@@ -2544,13 +2376,12 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = PREV;
+ PREV.br_blockcount -= new->br_blockcount;
- ++*idx;
- xfs_iext_insert(ip, *idx, 1, new, state);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
@@ -2558,22 +2389,17 @@ xfs_bmap_add_extent_unwritten_real(
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
- PREV.br_startblock,
- PREV.br_blockcount - new->br_blockcount,
- oldext)))
+ error = xfs_bmbt_update(cur, &PREV);
+ if (error)
goto done;
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2586,20 +2412,20 @@ xfs_bmap_add_extent_unwritten_real(
* newext. Contiguity is impossible here.
* One extent becomes three extents.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep,
- new->br_startoff - PREV.br_startoff);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = PREV;
+ PREV.br_blockcount = new->br_startoff - PREV.br_startoff;
r[0] = *new;
r[1].br_startoff = new_endoff;
r[1].br_blockcount =
- PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ old.br_startoff + old.br_blockcount - new_endoff;
r[1].br_startblock = new->br_startblock + new->br_blockcount;
- r[1].br_state = oldext;
+ r[1].br_state = PREV.br_state;
- ++*idx;
- xfs_iext_insert(ip, *idx, 2, &r[0], state);
+ xfs_iext_update_extent(ip, state, icur, &PREV);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &r[1], state);
+ xfs_iext_insert(ip, icur, &r[0], state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
@@ -2607,20 +2433,16 @@ xfs_bmap_add_extent_unwritten_real(
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
rval = XFS_ILOG_CORE;
- if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
- PREV.br_startblock, PREV.br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
/* new right extent - oldext */
- if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
- r[1].br_startblock, r[1].br_blockcount,
- r[1].br_state)))
+ error = xfs_bmbt_update(cur, &r[1]);
+ if (error)
goto done;
/* new left extent - oldext */
cur->bc_rec.b = PREV;
- cur->bc_rec.b.br_blockcount =
- new->br_startoff - PREV.br_startoff;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2629,13 +2451,11 @@ xfs_bmap_add_extent_unwritten_real(
* we are about to insert as we can't trust it after
* the previous insert.
*/
- if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
- new->br_startblock, new->br_blockcount,
- &i)))
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
+ if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
/* new middle extent - newext */
- cur->bc_rec.b.br_state = new->br_state;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2694,7 +2514,7 @@ STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
- xfs_extnum_t *idx, /* extent number to update/insert */
+ struct xfs_iext_cursor *icur,
xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -2702,22 +2522,17 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state; /* state bits, accessed thru macros */
- xfs_filblks_t temp=0; /* temp for indirect calculations */
+ int state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, whichfork);
- state = 0;
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
- if (*idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
-
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2726,10 +2541,8 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (*idx < xfs_iext_count(ifp)) {
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
-
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2761,22 +2574,20 @@ xfs_bmap_add_extent_hole_delay(
* on the left and on the right.
* Merge all three into a single extent record.
*/
- --*idx;
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
- nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_LEFT_CONTIG:
@@ -2785,18 +2596,17 @@ xfs_bmap_add_extent_hole_delay(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --*idx;
temp = left.br_blockcount + new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
- nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_RIGHT_CONTIG:
@@ -2805,16 +2615,15 @@ xfs_bmap_add_extent_hole_delay(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
- new->br_startoff,
- nullstartblock((int)newlen), temp, right.br_state);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
break;
case 0:
@@ -2824,7 +2633,7 @@ xfs_bmap_add_extent_hole_delay(
* Insert a new entry.
*/
oldlen = newlen = 0;
- xfs_iext_insert(ip, *idx, 1, new, state);
+ xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
@@ -2845,7 +2654,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork,
- xfs_extnum_t *idx,
+ struct xfs_iext_cursor *icur,
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
xfs_fsblock_t *first,
@@ -2860,27 +2669,19 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state; /* state bits, accessed thru macros */
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
- state = 0;
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (*idx > 0) {
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2889,9 +2690,8 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (*idx < xfs_iext_count(ifp)) {
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2928,14 +2728,11 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- left.br_blockcount + new->br_blockcount +
- right.br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ left.br_blockcount += new->br_blockcount + right.br_blockcount;
- xfs_iext_remove(ip, *idx + 1, 1, state);
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -2943,9 +2740,7 @@ xfs_bmap_add_extent_hole_real(
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
- right.br_startblock, right.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &right, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -2957,12 +2752,7 @@ xfs_bmap_add_extent_hole_real(
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(cur, left.br_startoff,
- left.br_startblock,
- left.br_blockcount +
- new->br_blockcount +
- right.br_blockcount,
- left.br_state);
+ error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
}
@@ -2974,27 +2764,21 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --*idx;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
- left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = left;
+ left.br_blockcount += new->br_blockcount;
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
- left.br_startblock, left.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(cur, left.br_startoff,
- left.br_startblock,
- left.br_blockcount +
- new->br_blockcount,
- left.br_state);
+ error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
}
@@ -3006,29 +2790,22 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
- new->br_startoff, new->br_startblock,
- new->br_blockcount + right.br_blockcount,
- right.br_state);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ old = right;
+
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = new->br_startblock;
+ right.br_blockcount += new->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &right);
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(cur,
- right.br_startoff,
- right.br_startblock,
- right.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(cur, new->br_startoff,
- new->br_startblock,
- new->br_blockcount +
- right.br_blockcount,
- right.br_state);
+ error = xfs_bmbt_update(cur, &right);
if (error)
goto done;
}
@@ -3040,21 +2817,17 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(ip, *idx, 1, new, state);
+ xfs_iext_insert(ip, icur, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(cur,
- new->br_startoff,
- new->br_startblock,
- new->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- cur->bc_rec.b.br_state = new->br_state;
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
@@ -3865,6 +3638,17 @@ xfs_trim_extent(
}
}
+/*
+ * Trim the extent record @irec to within EOF for inode @ip.
+ *
+ * The inode size is read from the VFS inode with i_size_read(), converted
+ * from bytes to filesystem blocks with XFS_B_TO_FSB() using the inode's
+ * mount, and handed to xfs_trim_extent() together with a lower bound of 0.
+ * xfs_trim_extent() is given @irec itself, and this function returns
+ * nothing, so the caller reads the result back from @irec.
+ *
+ * NOTE(review): presumably XFS_B_TO_FSB() rounds a partial trailing block
+ * up, so the block containing EOF stays mapped -- confirm against the
+ * macro definition.
+ */
+void
+xfs_trim_extent_eof(
+ struct xfs_bmbt_irec *irec,
+ struct xfs_inode *ip)
+
+{
+ xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
+ i_size_read(VFS_I(ip))));
+}
+
/*
* Trim the returned map to the required bounds
*/
@@ -3983,7 +3767,7 @@ xfs_bmapi_read(
struct xfs_bmbt_irec got;
xfs_fileoff_t obno;
xfs_fileoff_t end;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int error;
bool eof = false;
int n = 0;
@@ -4025,7 +3809,7 @@ xfs_bmapi_read(
return error;
}
- if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got))
eof = true;
end = bno + len;
obno = bno;
@@ -4057,7 +3841,7 @@ xfs_bmapi_read(
break;
/* Else go on to the next record. */
- if (!xfs_iext_get_extent(ifp, ++idx, &got))
+ if (!xfs_iext_next_extent(ifp, &icur, &got))
eof = true;
}
*nmap = n;
@@ -4085,7 +3869,7 @@ xfs_bmapi_reserve_delalloc(
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
- xfs_extnum_t *lastx,
+ struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
@@ -4115,7 +3899,7 @@ xfs_bmapi_reserve_delalloc(
if (extsz) {
struct xfs_bmbt_irec prev;
- if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev))
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof,
@@ -4164,7 +3948,7 @@ xfs_bmapi_reserve_delalloc(
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
- xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
/*
* Tag the inode if blocks were preallocated. Note that COW fork
@@ -4209,10 +3993,7 @@ xfs_bmapi_allocate(
if (bma->wasdel) {
bma->length = (xfs_extlen_t)bma->got.br_blockcount;
bma->offset = bma->got.br_startoff;
- if (bma->idx) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
- &bma->prev);
- }
+ xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev);
} else {
bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
if (!bma->eof)
@@ -4297,7 +4078,7 @@ xfs_bmapi_allocate(
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
- whichfork, &bma->idx, &bma->cur, &bma->got,
+ whichfork, &bma->icur, &bma->cur, &bma->got,
bma->firstblock, bma->dfops, &bma->logflags);
bma->logflags |= tmp_logflags;
@@ -4309,7 +4090,7 @@ xfs_bmapi_allocate(
* or xfs_bmap_add_extent_hole_real might have merged it into one of
* the neighbouring ones.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
ASSERT(bma->got.br_startoff <= bma->offset);
ASSERT(bma->got.br_startoff + bma->got.br_blockcount >=
@@ -4367,8 +4148,8 @@ xfs_bmapi_convert_unwritten(
}
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
- &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
- &tmp_logflags);
+ &bma->icur, &bma->cur, mval, bma->firstblock,
+ bma->dfops, &tmp_logflags);
/*
* Log the inode core unconditionally in the unwritten extent conversion
* path because the conversion might not have done so (e.g., if the
@@ -4390,7 +4171,7 @@ xfs_bmapi_convert_unwritten(
* xfs_bmap_add_extent_unwritten_real might have merged it into one
* of the neighbouring ones.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got);
+ xfs_iext_get_extent(ifp, &bma->icur, &bma->got);
/*
* We may have combined previously unwritten space with written space,
@@ -4509,9 +4290,9 @@ xfs_bmapi_write(
end = bno + len;
obno = bno;
- if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got))
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
- if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev))
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
bma.tp = tp;
bma.ip = ip;
@@ -4553,7 +4334,8 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if (need_alloc || wasdelay) {
+ if ((need_alloc || wasdelay) &&
+ !(flags & XFS_BMAPI_CONVERT_ONLY)) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4616,7 +4398,7 @@ xfs_bmapi_write(
/* Else go on to the next record. */
bma.prev = bma.got;
- if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got))
+ if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
eof = true;
}
*nmap = n;
@@ -4689,7 +4471,7 @@ xfs_bmapi_remap(
struct xfs_btree_cur *cur = NULL;
xfs_fsblock_t firstblock = NULLFSBLOCK;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int logflags = 0, error;
ASSERT(len > 0);
@@ -4713,7 +4495,7 @@ xfs_bmapi_remap(
return error;
}
- if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) {
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
/* make sure we only reflink into a hole. */
ASSERT(got.br_startoff > bno);
ASSERT(got.br_startoff - bno >= len);
@@ -4734,8 +4516,8 @@ xfs_bmapi_remap(
got.br_blockcount = len;
got.br_state = XFS_EXT_NORM;
- error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur,
- &got, &firstblock, dfops, &logflags);
+ error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur,
+ &cur, &got, &firstblock, dfops, &logflags);
if (error)
goto error0;
@@ -4851,7 +4633,7 @@ int
xfs_bmap_del_extent_delay(
struct xfs_inode *ip,
int whichfork,
- xfs_extnum_t *idx,
+ struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del)
{
@@ -4861,7 +4643,8 @@ xfs_bmap_del_extent_delay(
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int error = 0, state = 0;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ int error = 0;
bool isrt;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4872,8 +4655,6 @@ xfs_bmap_del_extent_delay(
da_old = startblockval(got->br_startblock);
da_new = 0;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
@@ -4897,46 +4678,39 @@ xfs_bmap_del_extent_delay(
return error;
ip->i_delayed_blks -= del->br_blockcount;
- if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
if (got->br_startoff == del->br_startoff)
- state |= BMAP_LEFT_CONTIG;
+ state |= BMAP_LEFT_FILLING;
if (got_endoff == del_endoff)
- state |= BMAP_RIGHT_CONTIG;
+ state |= BMAP_RIGHT_FILLING;
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_iext_remove(ip, *idx, 1, state);
- --*idx;
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
break;
- case BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_FILLING:
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_startoff = del_endoff;
got->br_blockcount -= del->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
got->br_blockcount), da_old);
got->br_startblock = nullstartblock((int)da_new);
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
- case BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING:
/*
* Deleting the last part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount = got->br_blockcount - del->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
got->br_blockcount), da_old);
got->br_startblock = nullstartblock((int)da_new);
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
case 0:
/*
@@ -4948,8 +4722,6 @@ xfs_bmap_del_extent_delay(
* Warn if either of the new indlen reservations is zero as this
* can lead to delalloc problems.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-
got->br_blockcount = del->br_startoff - got->br_startoff;
got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount);
@@ -4961,15 +4733,14 @@ xfs_bmap_del_extent_delay(
del->br_blockcount);
got->br_startblock = nullstartblock((int)got_indlen);
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_);
new.br_startoff = del_endoff;
new.br_state = got->br_state;
new.br_startblock = nullstartblock((int)new_indlen);
- ++*idx;
- xfs_iext_insert(ip, *idx, 1, &new, state);
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
da_new = got_indlen + new_indlen - stolen;
del->br_blockcount -= stolen;
@@ -4988,7 +4759,7 @@ xfs_bmap_del_extent_delay(
void
xfs_bmap_del_extent_cow(
struct xfs_inode *ip,
- xfs_extnum_t *idx,
+ struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del)
{
@@ -5003,75 +4774,67 @@ xfs_bmap_del_extent_cow(
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got->br_startoff + got->br_blockcount;
- ASSERT(*idx >= 0);
- ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
ASSERT(!isnullstartblock(got->br_startblock));
if (got->br_startoff == del->br_startoff)
- state |= BMAP_LEFT_CONTIG;
+ state |= BMAP_LEFT_FILLING;
if (got_endoff == del_endoff)
- state |= BMAP_RIGHT_CONTIG;
+ state |= BMAP_RIGHT_FILLING;
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_iext_remove(ip, *idx, 1, state);
- --*idx;
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
break;
- case BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_FILLING:
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_startoff = del_endoff;
got->br_blockcount -= del->br_blockcount;
got->br_startblock = del->br_startblock + del->br_blockcount;
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
- case BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING:
/*
* Deleting the last part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount -= del->br_blockcount;
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_update_extent(ip, state, icur, got);
break;
case 0:
/*
* Deleting the middle of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount = del->br_startoff - got->br_startoff;
- xfs_iext_update_extent(ifp, *idx, got);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
new.br_startoff = del_endoff;
new.br_blockcount = got_endoff - del_endoff;
new.br_state = got->br_state;
new.br_startblock = del->br_startblock + del->br_blockcount;
- ++*idx;
- xfs_iext_insert(ip, *idx, 1, &new, state);
+ xfs_iext_update_extent(ip, state, icur, got);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
break;
}
}
/*
* Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
+ * after removing space.
*/
STATIC int /* error */
-xfs_bmap_del_extent(
+xfs_bmap_del_extent_real(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t *idx, /* extent number to update/delete */
+ struct xfs_iext_cursor *icur,
struct xfs_defer_ops *dfops, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
@@ -5079,16 +4842,12 @@ xfs_bmap_del_extent(
int whichfork, /* data or attr fork */
int bflags) /* bmapi flags */
{
- xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
- xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
- int delay; /* current block is delayed allocated */
int do_fx; /* free extent at end of routine */
- xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
int error; /* error return value */
- int flags; /* inode logging flags */
- xfs_bmbt_irec_t got; /* current extent entry */
+ int flags = 0;/* inode logging flags */
+ struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -5097,103 +4856,81 @@ xfs_bmap_del_extent(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- xfs_filblks_t temp; /* for indirect length calculations */
- xfs_filblks_t temp2; /* for indirect length calculations */
- int state = 0;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_bmbt_irec old;
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
- if (whichfork == XFS_ATTR_FORK)
- state |= BMAP_ATTRFORK;
- else if (whichfork == XFS_COW_FORK)
- state |= BMAP_COWFORK;
-
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp)));
ASSERT(del->br_blockcount > 0);
- ep = xfs_iext_get_ext(ifp, *idx);
- xfs_bmbt_get_all(ep, &got);
+ xfs_iext_get_extent(ifp, icur, &got);
ASSERT(got.br_startoff <= del->br_startoff);
del_endoff = del->br_startoff + del->br_blockcount;
got_endoff = got.br_startoff + got.br_blockcount;
ASSERT(got_endoff >= del_endoff);
- delay = isnullstartblock(got.br_startblock);
- ASSERT(isnullstartblock(del->br_startblock) == delay);
- flags = 0;
+ ASSERT(!isnullstartblock(got.br_startblock));
qfield = 0;
error = 0;
+
/*
- * If deleting a real allocation, must free up the disk space.
+ * If it's the case where the directory code is running with no block
+ * reservation, and the deleted block is in the middle of its extent,
+ * and the resulting insert of an extent would cause transformation to
+ * btree format, then reject it. The calling code will then swap blocks
+ * around instead. We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction will be dirty then.
*/
- if (!delay) {
- flags = XFS_ILOG_CORE;
- /*
- * Realtime allocation. Free it and record di_nblocks update.
- */
- if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_fsblock_t bno;
- xfs_filblks_t len;
-
- ASSERT(do_mod(del->br_blockcount,
- mp->m_sb.sb_rextsize) == 0);
- ASSERT(do_mod(del->br_startblock,
- mp->m_sb.sb_rextsize) == 0);
- bno = del->br_startblock;
- len = del->br_blockcount;
- do_div(bno, mp->m_sb.sb_rextsize);
- do_div(len, mp->m_sb.sb_rextsize);
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
- if (error)
- goto done;
- do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
- qfield = XFS_TRANS_DQ_RTBCOUNT;
- }
- /*
- * Ordinary allocation.
- */
- else {
- do_fx = 1;
- nblks = del->br_blockcount;
- qfield = XFS_TRANS_DQ_BCOUNT;
- }
- /*
- * Set up del_endblock and cur for later.
- */
- del_endblock = del->br_startblock + del->br_blockcount;
- if (cur) {
- if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock, got.br_blockcount,
- &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- }
- da_old = da_new = 0;
- } else {
- da_old = startblockval(got.br_startblock);
- da_new = 0;
- nblks = 0;
+ if (tp->t_blk_res == 0 &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >=
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
+ del->br_startoff > got.br_startoff && del_endoff < got_endoff)
+ return -ENOSPC;
+
+ flags = XFS_ILOG_CORE;
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_fsblock_t bno;
+ xfs_filblks_t len;
+
+ ASSERT(do_mod(del->br_blockcount, mp->m_sb.sb_rextsize) == 0);
+ ASSERT(do_mod(del->br_startblock, mp->m_sb.sb_rextsize) == 0);
+ bno = del->br_startblock;
+ len = del->br_blockcount;
+ do_div(bno, mp->m_sb.sb_rextsize);
+ do_div(len, mp->m_sb.sb_rextsize);
+ error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ if (error)
+ goto done;
do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ } else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
}
- /*
- * Set flag value to use in switch statement.
- * Left-contig is 2, right-contig is 1.
- */
- switch (((got.br_startoff == del->br_startoff) << 1) |
- (got_endoff == del_endoff)) {
- case 3:
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ }
+
+ if (got.br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ switch (state & (BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) {
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
/*
* Matches the whole extent. Delete the entry.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, *idx, 1,
- whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
- --*idx;
- if (delay)
- break;
-
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
flags |= XFS_ILOG_CORE;
@@ -5205,168 +4942,106 @@ xfs_bmap_del_extent(
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
break;
-
- case 2:
+ case BMAP_LEFT_FILLING:
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, del_endoff);
- temp = got.br_blockcount - del->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ got.br_startoff = del_endoff;
+ got.br_startblock = del_endblock;
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
}
- if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
goto done;
break;
-
- case 1:
+ case BMAP_RIGHT_FILLING:
/*
* Deleting the last part of the extent.
*/
- temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ got.br_blockcount -= del->br_blockcount;
+ xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
}
- if ((error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount - del->br_blockcount,
- got.br_state)))
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
goto done;
break;
-
case 0:
/*
* Deleting the middle of the extent.
*/
- temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
+ old = got;
+
+ got.br_blockcount = del->br_startoff - got.br_startoff;
+ xfs_iext_update_extent(ip, state, icur, &got);
+
new.br_startoff = del_endoff;
- temp2 = got_endoff - del_endoff;
- new.br_blockcount = temp2;
+ new.br_blockcount = got_endoff - del_endoff;
new.br_state = got.br_state;
- if (!delay) {
- new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
- if (cur) {
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock, temp,
- got.br_state)))
- goto done;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto done;
- cur->bc_rec.b = new;
- error = xfs_btree_insert(cur, &i);
- if (error && error != -ENOSPC)
- goto done;
+ new.br_startblock = del_endblock;
+
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, &got);
+ if (error)
+ goto done;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != -ENOSPC)
+ goto done;
+ /*
+ * If we get no-space back from btree insert, it tried a
+ * split, and we have a zero block reservation. Fix up
+ * our state and return the error.
+ */
+ if (error == -ENOSPC) {
/*
- * If get no-space back from btree insert,
- * it tried a split, and we have a zero
- * block reservation.
- * Fix up our state and return the error.
+ * Reset the cursor, don't trust it after any
+ * insert operation.
*/
- if (error == -ENOSPC) {
- /*
- * Reset the cursor, don't trust
- * it after any insert operation.
- */
- if ((error = xfs_bmbt_lookup_eq(cur,
- got.br_startoff,
- got.br_startblock,
- temp, &i)))
- goto done;
- XFS_WANT_CORRUPTED_GOTO(mp,
- i == 1, done);
- /*
- * Update the btree record back
- * to the original value.
- */
- if ((error = xfs_bmbt_update(cur,
- got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state)))
- goto done;
- /*
- * Reset the extent record back
- * to the original value.
- */
- xfs_bmbt_set_blockcount(ep,
- got.br_blockcount);
- flags = 0;
- error = -ENOSPC;
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
+ if (error)
goto done;
- }
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- } else
- flags |= xfs_ilog_fext(whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
- } else {
- xfs_filblks_t stolen;
- ASSERT(whichfork == XFS_DATA_FORK);
-
- /*
- * Distribute the original indlen reservation across the
- * two new extents. Steal blocks from the deleted extent
- * if necessary. Stealing blocks simply fudges the
- * fdblocks accounting in xfs_bunmapi().
- */
- temp = xfs_bmap_worst_indlen(ip, got.br_blockcount);
- temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount);
- stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2,
- del->br_blockcount);
- da_new = temp + temp2 - stolen;
- del->br_blockcount -= stolen;
-
- /*
- * Set the reservation for each extent. Warn if either
- * is zero as this can lead to delalloc problems.
- */
- WARN_ON_ONCE(!temp || !temp2);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- new.br_startblock = nullstartblock((int)temp2);
- }
- trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_insert(ip, *idx + 1, 1, &new, state);
- ++*idx;
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ error = xfs_bmbt_update(cur, &old);
+ if (error)
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_iext_update_extent(ip, state, icur, &old);
+ flags = 0;
+ error = -ENOSPC;
+ goto done;
+ }
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ xfs_iext_next(ifp, icur);
+ xfs_iext_insert(ip, icur, &new, state);
break;
}
/* remove reverse mapping */
- if (!delay) {
- error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
- if (error)
- goto done;
- }
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, del);
+ if (error)
+ goto done;
/*
* If we need to, add to list of extents to delete.
@@ -5392,13 +5067,6 @@ xfs_bmap_del_extent(
if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
- /*
- * Account for change in delayed indirect blocks.
- * Nothing to do for disk quota accounting here.
- */
- ASSERT(da_old >= da_new);
- if (da_old > da_new)
- xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
done:
*logflagsp = flags;
return error;
@@ -5414,7 +5082,7 @@ int /* error */
__xfs_bunmapi(
xfs_trans_t *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
- xfs_fileoff_t bno, /* starting offset to unmap */
+ xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
int flags, /* misc flags */
xfs_extnum_t nexts, /* number of extents max */
@@ -5429,11 +5097,9 @@ __xfs_bunmapi(
xfs_bmbt_irec_t got; /* current extent record */
xfs_ifork_t *ifp; /* inode fork pointer */
int isrt; /* freeing in rt area */
- xfs_extnum_t lastx; /* last extent index used */
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
xfs_mount_t *mp; /* mount structure */
- xfs_fileoff_t start; /* first file offset deleted */
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
@@ -5441,8 +5107,11 @@ __xfs_bunmapi(
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t max_len;
xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
+ xfs_fileoff_t end;
+ struct xfs_iext_cursor icur;
+ bool done = false;
- trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+ trace_xfs_bunmap(ip, start, len, flags, _RET_IP_);
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
@@ -5481,18 +5150,13 @@ __xfs_bunmapi(
}
XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
- start = bno;
- bno = start + len - 1;
+ end = start + len;
- /*
- * Check to see if the given block number is past the end of the
- * file, back up to the last block if so...
- */
- if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) {
- ASSERT(lastx > 0);
- xfs_iext_get_extent(ifp, --lastx, &got);
- bno = got.br_startoff + got.br_blockcount - 1;
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
+ *rlen = 0;
+ return 0;
}
+ end--;
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
@@ -5515,24 +5179,24 @@ __xfs_bunmapi(
}
extno = 0;
- while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+ while (end != (xfs_fileoff_t)-1 && end >= start &&
(nexts == 0 || extno < nexts) && max_len > 0) {
/*
- * Is the found extent after a hole in which bno lives?
+ * Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
*/
- if (got.br_startoff > bno) {
- if (--lastx < 0)
- break;
- xfs_iext_get_extent(ifp, lastx, &got);
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
}
/*
* Is the last block of this extent before the range
* we're supposed to delete? If so, we're done.
*/
- bno = XFS_FILEOFF_MIN(bno,
+ end = XFS_FILEOFF_MIN(end,
got.br_startoff + got.br_blockcount - 1);
- if (bno < start)
+ if (end < start)
break;
/*
* Then deal with the (possibly delayed) allocated space
@@ -5557,8 +5221,8 @@ __xfs_bunmapi(
if (!wasdel)
del.br_startblock += start - got.br_startoff;
}
- if (del.br_startoff + del.br_blockcount > bno + 1)
- del.br_blockcount = bno + 1 - del.br_startoff;
+ if (del.br_startoff + del.br_blockcount > end + 1)
+ del.br_blockcount = end + 1 - del.br_startoff;
/* How much can we safely unmap? */
if (max_len < del.br_blockcount) {
@@ -5584,13 +5248,13 @@ __xfs_bunmapi(
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
*/
- ASSERT(bno >= mod);
- bno -= mod > del.br_blockcount ?
+ ASSERT(end >= mod);
+ end -= mod > del.br_blockcount ?
del.br_blockcount : mod;
- if (bno < got.br_startoff) {
- if (--lastx >= 0)
- xfs_bmbt_get_all(xfs_iext_get_ext(
- ifp, lastx), &got);
+ if (end < got.br_startoff &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
}
continue;
}
@@ -5611,7 +5275,7 @@ __xfs_bunmapi(
}
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp, ip,
- whichfork, &lastx, &cur, &del,
+ whichfork, &icur, &cur, &del,
firstblock, dfops, &logflags);
if (error)
goto error0;
@@ -5636,10 +5300,13 @@ __xfs_bunmapi(
* Can't make it unwritten. There isn't
* a full extent here so just skip it.
*/
- ASSERT(bno >= del.br_blockcount);
- bno -= del.br_blockcount;
- if (got.br_startoff > bno && --lastx >= 0)
- xfs_iext_get_extent(ifp, lastx, &got);
+ ASSERT(end >= del.br_blockcount);
+ end -= del.br_blockcount;
+ if (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got)) {
+ done = true;
+ break;
+ }
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
struct xfs_bmbt_irec prev;
@@ -5650,8 +5317,8 @@ __xfs_bunmapi(
* Unwrite the killed part of that one and
* try again.
*/
- ASSERT(lastx > 0);
- xfs_iext_get_extent(ifp, lastx - 1, &prev);
+ if (!xfs_iext_prev_extent(ifp, &icur, &prev))
+ ASSERT(0);
ASSERT(prev.br_state == XFS_EXT_NORM);
ASSERT(!isnullstartblock(prev.br_startblock));
ASSERT(del.br_startblock ==
@@ -5663,9 +5330,8 @@ __xfs_bunmapi(
prev.br_startoff = start;
}
prev.br_state = XFS_EXT_UNWRITTEN;
- lastx--;
error = xfs_bmap_add_extent_unwritten_real(tp,
- ip, whichfork, &lastx, &cur,
+ ip, whichfork, &icur, &cur,
&prev, firstblock, dfops,
&logflags);
if (error)
@@ -5675,7 +5341,7 @@ __xfs_bunmapi(
ASSERT(del.br_state == XFS_EXT_NORM);
del.br_state = XFS_EXT_UNWRITTEN;
error = xfs_bmap_add_extent_unwritten_real(tp,
- ip, whichfork, &lastx, &cur,
+ ip, whichfork, &icur, &cur,
&del, firstblock, dfops,
&logflags);
if (error)
@@ -5684,85 +5350,39 @@ __xfs_bunmapi(
}
}
- /*
- * If it's the case where the directory code is running
- * with no block reservation, and the deleted block is in
- * the middle of its extent, and the resulting insert
- * of an extent would cause transformation to btree format,
- * then reject it. The calling code will then swap
- * blocks around instead.
- * We have to do this now, rather than waiting for the
- * conversion to btree format, since the transaction
- * will be dirty.
- */
- if (!wasdel && tp->t_blk_res == 0 &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
- XFS_IFORK_MAXEXT(ip, whichfork) &&
- del.br_startoff > got.br_startoff &&
- del.br_startoff + del.br_blockcount <
- got.br_startoff + got.br_blockcount) {
- error = -ENOSPC;
- goto error0;
+ if (wasdel) {
+ error = xfs_bmap_del_extent_delay(ip, whichfork, &icur,
+ &got, &del);
+ } else {
+ error = xfs_bmap_del_extent_real(ip, tp, &icur, dfops,
+ cur, &del, &tmp_logflags, whichfork,
+ flags);
+ logflags |= tmp_logflags;
}
- /*
- * Unreserve quota and update realtime free space, if
- * appropriate. If delayed allocation, update the inode delalloc
- * counter now and wait to update the sb counters as
- * xfs_bmap_del_extent() might need to borrow some blocks.
- */
- if (wasdel) {
- ASSERT(startblockval(del.br_startblock) > 0);
- if (isrt) {
- xfs_filblks_t rtexts;
-
- rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
- do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_frextents(mp, (int64_t)rtexts);
- (void)xfs_trans_reserve_quota_nblks(NULL,
- ip, -((long)del.br_blockcount), 0,
- XFS_QMOPT_RES_RTBLKS);
- } else {
- (void)xfs_trans_reserve_quota_nblks(NULL,
- ip, -((long)del.br_blockcount), 0,
- XFS_QMOPT_RES_REGBLKS);
- }
- ip->i_delayed_blks -= del.br_blockcount;
- if (cur)
- cur->bc_private.b.flags |=
- XFS_BTCUR_BPRV_WASDEL;
- } else if (cur)
- cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
-
- error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
- &tmp_logflags, whichfork, flags);
- logflags |= tmp_logflags;
if (error)
goto error0;
- if (!isrt && wasdel)
- xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
-
max_len -= del.br_blockcount;
- bno = del.br_startoff - 1;
+ end = del.br_startoff - 1;
nodelete:
/*
* If not done go on to the next (previous) record.
*/
- if (bno != (xfs_fileoff_t)-1 && bno >= start) {
- if (lastx >= 0) {
- xfs_iext_get_extent(ifp, lastx, &got);
- if (got.br_startoff > bno && --lastx >= 0)
- xfs_iext_get_extent(ifp, lastx, &got);
+ if (end != (xfs_fileoff_t)-1 && end >= start) {
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ (got.br_startoff > end &&
+ !xfs_iext_prev_extent(ifp, &icur, &got))) {
+ done = true;
+ break;
}
extno++;
}
}
- if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0)
+ if (done || end == (xfs_fileoff_t)-1 || end < start)
*rlen = 0;
else
- *rlen = bno - start + 1;
+ *rlen = end - start + 1;
/*
* Convert to a btree if necessary.
@@ -5880,14 +5500,13 @@ xfs_bmse_merge(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t shift, /* shift fsb */
- int current_ext, /* idx of gotp */
+ struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got, /* extent to shift */
struct xfs_bmbt_irec *left, /* preceding extent */
struct xfs_btree_cur *cur,
int *logflags, /* output */
struct xfs_defer_ops *dfops)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
@@ -5915,8 +5534,7 @@ xfs_bmse_merge(
}
/* lookup and remove the extent to merge */
- error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock,
- got->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
@@ -5927,20 +5545,20 @@ xfs_bmse_merge(
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
/* lookup and update size of the previous extent */
- error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock,
- left->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
- error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock,
- new.br_blockcount, new.br_state);
+ error = xfs_bmbt_update(cur, &new);
if (error)
return error;
done:
- xfs_iext_update_extent(ifp, current_ext - 1, &new);
- xfs_iext_remove(ip, current_ext, 1, 0);
+ xfs_iext_remove(ip, icur, 0);
+ xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ &new);
/* update reverse mapping. rmap functions merge the rmaps for us */
error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got);
@@ -5951,183 +5569,83 @@ done:
return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new);
}
-/*
- * Shift a single extent.
- */
-STATIC int
-xfs_bmse_shift_one(
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t offset_shift_fsb,
- int *current_ext,
- struct xfs_bmbt_irec *got,
- struct xfs_btree_cur *cur,
- int *logflags,
- enum shift_direction direction,
- struct xfs_defer_ops *dfops)
+static int
+xfs_bmap_shift_update_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *got,
+ struct xfs_btree_cur *cur,
+ int *logflags,
+ struct xfs_defer_ops *dfops,
+ xfs_fileoff_t startoff)
{
- struct xfs_ifork *ifp;
- struct xfs_mount *mp;
- xfs_fileoff_t startoff;
- struct xfs_bmbt_irec adj_irec, new;
- int error;
- int i;
- int total_extents;
-
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- total_extents = xfs_iext_count(ifp);
-
- /* delalloc extents should be prevented by caller */
- XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock));
-
- if (direction == SHIFT_LEFT) {
- startoff = got->br_startoff - offset_shift_fsb;
-
- /*
- * Check for merge if we've got an extent to the left,
- * otherwise make sure there's enough room at the start
- * of the file for the shift.
- */
- if (!*current_ext) {
- if (got->br_startoff < offset_shift_fsb)
- return -EINVAL;
- goto update_current_ext;
- }
-
- /*
- * grab the left extent and check for a large enough hole.
- */
- xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec);
- if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount)
- return -EINVAL;
-
- /* check whether to merge the extent or shift it down */
- if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) {
- return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
- *current_ext, got, &adj_irec,
- cur, logflags, dfops);
- }
- } else {
- startoff = got->br_startoff + offset_shift_fsb;
- /* nothing to move if this is the last extent */
- if (*current_ext >= (total_extents - 1))
- goto update_current_ext;
-
- /*
- * If this is not the last extent in the file, make sure there
- * is enough room between current extent and next extent for
- * accommodating the shift.
- */
- xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec);
- if (startoff + got->br_blockcount > adj_irec.br_startoff)
- return -EINVAL;
-
- /*
- * Unlike a left shift (which involves a hole punch),
- * a right shift does not modify extent neighbors
- * in any way. We should never find mergeable extents
- * in this scenario. Check anyways and warn if we
- * encounter two extents that could be one.
- */
- if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb))
- WARN_ON_ONCE(1);
- }
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec prev = *got;
+ int error, i;
- /*
- * Increment the extent index for the next iteration, update the start
- * offset of the in-core extent and update the btree if applicable.
- */
-update_current_ext:
*logflags |= XFS_ILOG_CORE;
- new = *got;
- new.br_startoff = startoff;
+ got->br_startoff = startoff;
if (cur) {
- error = xfs_bmbt_lookup_eq(cur, got->br_startoff,
- got->br_startblock, got->br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
- error = xfs_bmbt_update(cur, new.br_startoff,
- new.br_startblock, new.br_blockcount,
- new.br_state);
+ error = xfs_bmbt_update(cur, got);
if (error)
return error;
} else {
*logflags |= XFS_ILOG_DEXT;
}
- xfs_iext_update_extent(ifp, *current_ext, &new);
-
- if (direction == SHIFT_LEFT)
- (*current_ext)++;
- else
- (*current_ext)--;
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
+ got);
/* update reverse mapping */
- error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got);
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &prev);
if (error)
return error;
- return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new);
+ return xfs_rmap_map_extent(mp, dfops, ip, whichfork, got);
}
-/*
- * Shift extent records to the left/right to cover/create a hole.
- *
- * The maximum number of extents to be shifted in a single operation is
- * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the
- * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
- * is the length by which each extent is shifted. If there is no hole to shift
- * the extents into, this will be considered invalid operation and we abort
- * immediately.
- */
int
-xfs_bmap_shift_extents(
+xfs_bmap_collapse_extents(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *next_fsb,
xfs_fileoff_t offset_shift_fsb,
- int *done,
+ bool *done,
xfs_fileoff_t stop_fsb,
xfs_fsblock_t *firstblock,
- struct xfs_defer_ops *dfops,
- enum shift_direction direction,
- int num_exts)
+ struct xfs_defer_ops *dfops)
{
- struct xfs_btree_cur *cur = NULL;
- struct xfs_bmbt_irec got;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
- xfs_extnum_t nexts = 0;
- xfs_extnum_t current_ext;
- xfs_extnum_t total_extents;
- xfs_extnum_t stop_extent;
- int error = 0;
- int whichfork = XFS_DATA_FORK;
- int logflags = 0;
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, prev;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmap_shift_extents",
- XFS_ERRLEVEL_LOW, mp);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
- ifp = XFS_IFORK_PTR(ip, whichfork);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- /* Read in all the extents */
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
@@ -6140,107 +5658,165 @@ xfs_bmap_shift_extents(
cur->bc_private.b.flags = 0;
}
- /*
- * There may be delalloc extents in the data fork before the range we
- * are collapsing out, so we cannot use the count of real extents here.
- * Instead we have to calculate it from the incore fork.
- */
- total_extents = xfs_iext_count(ifp);
- if (total_extents == 0) {
- *done = 1;
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
goto del_cursor;
}
+ XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
- /*
- * In case of first right shift, we need to initialize next_fsb
- */
- if (*next_fsb == NULLFSBLOCK) {
- ASSERT(direction == SHIFT_RIGHT);
-
- current_ext = total_extents - 1;
- xfs_iext_get_extent(ifp, current_ext, &got);
- if (stop_fsb > got.br_startoff) {
- *done = 1;
+ new_startoff = got.br_startoff - offset_shift_fsb;
+ if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
+ if (new_startoff < prev.br_startoff + prev.br_blockcount) {
+ error = -EINVAL;
goto del_cursor;
}
- *next_fsb = got.br_startoff;
+
+ if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) {
+ error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+ &icur, &got, &prev, cur, &logflags,
+ dfops);
+ if (error)
+ goto del_cursor;
+ goto done;
+ }
} else {
- /*
- * Look up the extent index for the fsb where we start shifting. We can
- * henceforth iterate with current_ext as extent list changes are locked
- * out via ilock.
- *
- * If next_fsb lies in a hole beyond which there are no extents we are
- * done.
- */
- if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &current_ext,
- &got)) {
- *done = 1;
+ if (got.br_startoff < offset_shift_fsb) {
+ error = -EINVAL;
goto del_cursor;
}
}
- /* Lookup the extent index at which we have to stop */
- if (direction == SHIFT_RIGHT) {
- struct xfs_bmbt_irec s;
+ error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur,
+ &logflags, dfops, new_startoff);
+ if (error)
+ goto del_cursor;
+
+done:
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ *done = true;
+ goto del_cursor;
+ }
- xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s);
- /* Make stop_extent exclusive of shift range */
- stop_extent--;
- if (current_ext <= stop_extent) {
- error = -EIO;
+ *next_fsb = got.br_startoff;
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+int
+xfs_bmap_insert_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ bool *done,
+ xfs_fileoff_t stop_fsb,
+ xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_irec got, next;
+ struct xfs_iext_cursor icur;
+ xfs_fileoff_t new_startoff;
+ int error = 0;
+ int logflags = 0;
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.dfops = dfops;
+ cur->bc_private.b.flags = 0;
+ }
+
+ if (*next_fsb == NULLFSBLOCK) {
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &got) ||
+ stop_fsb > got.br_startoff) {
+ *done = true;
goto del_cursor;
}
} else {
- stop_extent = total_extents;
- if (current_ext >= stop_extent) {
- error = -EIO;
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
+ *done = true;
goto del_cursor;
}
}
+ XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
- while (nexts++ < num_exts) {
- error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
- &current_ext, &got, cur, &logflags,
- direction, dfops);
- if (error)
+ if (stop_fsb >= got.br_startoff + got.br_blockcount) {
+ error = -EIO;
+ goto del_cursor;
+ }
+
+ new_startoff = got.br_startoff + offset_shift_fsb;
+ if (xfs_iext_peek_next_extent(ifp, &icur, &next)) {
+ if (new_startoff + got.br_blockcount > next.br_startoff) {
+ error = -EINVAL;
goto del_cursor;
- /*
- * If there was an extent merge during the shift, the extent
- * count can change. Update the total and grade the next record.
- */
- if (direction == SHIFT_LEFT) {
- total_extents = xfs_iext_count(ifp);
- stop_extent = total_extents;
}
- if (current_ext == stop_extent) {
- *done = 1;
- *next_fsb = NULLFSBLOCK;
- break;
- }
- xfs_iext_get_extent(ifp, current_ext, &got);
+ /*
+ * Unlike a left shift (which involves a hole punch), a right
+ * shift does not modify extent neighbors in any way. We should
+ * never find mergeable extents in this scenario. Check anyways
+ * and warn if we encounter two extents that could be one.
+ */
+ if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb))
+ WARN_ON_ONCE(1);
}
- if (!*done)
- *next_fsb = got.br_startoff;
+ error = xfs_bmap_shift_update_extent(ip, whichfork, &icur, &got, cur,
+ &logflags, dfops, new_startoff);
+ if (error)
+ goto del_cursor;
+
+ if (!xfs_iext_prev_extent(ifp, &icur, &got) ||
+ stop_fsb >= got.br_startoff + got.br_blockcount) {
+ *done = true;
+ goto del_cursor;
+ }
+ *next_fsb = got.br_startoff;
del_cursor:
if (cur)
xfs_btree_del_cursor(cur,
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-
if (logflags)
xfs_trans_log_inode(tp, ip, logflags);
-
return error;
}
/*
- * Splits an extent into two extents at split_fsb block such that it is
- * the first block of the current_ext. @current_ext is a target extent
- * to be split. @split_fsb is a block where the extents is split.
- * If split_fsb lies in a hole or the first block of extents, just return 0.
+ * Splits an extent into two extents at split_fsb block such that it is the
+ * first block of the current_ext. @ext is a target extent to be split.
+ * @split_fsb is a block where the extents is split. If split_fsb lies in a
+ * hole or the first block of extents, just return 0.
*/
STATIC int
xfs_bmap_split_extent_at(
@@ -6257,7 +5833,7 @@ xfs_bmap_split_extent_at(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
xfs_fsblock_t gotblkcnt; /* new block count for got */
- xfs_extnum_t current_ext;
+ struct xfs_iext_cursor icur;
int error = 0;
int logflags = 0;
int i = 0;
@@ -6285,7 +5861,7 @@ xfs_bmap_split_extent_at(
/*
* If there are not extents, or split_fsb lies in a hole we are done.
*/
- if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &current_ext, &got) ||
+ if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &icur, &got) ||
got.br_startoff >= split_fsb)
return 0;
@@ -6300,44 +5876,35 @@ xfs_bmap_split_extent_at(
cur->bc_private.b.firstblock = *firstfsb;
cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
- error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
}
got.br_blockcount = gotblkcnt;
- xfs_iext_update_extent(ifp, current_ext, &got);
+ xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), &icur,
+ &got);
logflags = XFS_ILOG_CORE;
if (cur) {
- error = xfs_bmbt_update(cur, got.br_startoff,
- got.br_startblock,
- got.br_blockcount,
- got.br_state);
+ error = xfs_bmbt_update(cur, &got);
if (error)
goto del_cursor;
} else
logflags |= XFS_ILOG_DEXT;
/* Add new extent */
- current_ext++;
- xfs_iext_insert(ip, current_ext, 1, &new, 0);
+ xfs_iext_next(ifp, &icur);
+ xfs_iext_insert(ip, &icur, &new, 0);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur) {
- error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
- new.br_startblock, new.br_blockcount,
- &i);
+ error = xfs_bmbt_lookup_eq(cur, &new, &i);
if (error)
goto del_cursor;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
- cur->bc_rec.b.br_state = new.br_state;
-
error = xfs_btree_insert(cur, &i);
if (error)
goto del_cursor;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 851982a5dfbc..e36d75799cd5 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -43,7 +43,7 @@ struct xfs_bmalloca {
xfs_fsblock_t blkno; /* starting block of new extent */
struct xfs_btree_cur *cur; /* btree cursor */
- xfs_extnum_t idx; /* current extent index */
+ struct xfs_iext_cursor icur; /* incore extent cursor */
int nallocs;/* number of extents alloc'd */
int logflags;/* flags for transaction logging */
@@ -113,6 +113,9 @@ struct xfs_extent_free_item
/* Only convert delalloc space, don't allocate entirely new extents */
#define XFS_BMAPI_DELALLOC 0x400
+/* Only convert unwritten extents, don't allocate new blocks */
+#define XFS_BMAPI_CONVERT_ONLY 0x800
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -124,7 +127,8 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }
+ { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
+ { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }
static inline int xfs_bmapi_aflag(int w)
@@ -183,31 +187,9 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
!isnullstartblock(irec->br_startblock);
}
-/*
- * This macro is used to determine how many extents will be shifted
- * in one write transaction. We could require two splits,
- * an extent move on the first and an extent merge on the second,
- * So it is proper that one extent is shifted inside write transaction
- * at a time.
- */
-#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
-
-enum shift_direction {
- SHIFT_LEFT = 0,
- SHIFT_RIGHT,
-};
-
-#ifdef DEBUG
-void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
- int whichfork, unsigned long caller_ip);
-#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
- xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
-#else
-#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
-#endif
-
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
+void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
@@ -221,8 +203,6 @@ int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
-int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
- int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
int *nmap, int flags);
@@ -240,20 +220,25 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
- xfs_extnum_t *idx, struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
-void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
- struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
uint xfs_default_attroffset(struct xfs_inode *ip);
-int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops);
+int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
- int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
- struct xfs_defer_ops *dfops, enum shift_direction direction,
- int num_exts);
+ bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+ struct xfs_defer_ops *dfops);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof);
+ struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
+ int eof);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
@@ -277,4 +262,16 @@ int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+static inline int xfs_bmap_fork_to_state(int whichfork)
+{
+ switch (whichfork) {
+ case XFS_ATTR_FORK:
+ return BMAP_ATTRFORK;
+ case XFS_COW_FORK:
+ return BMAP_COWFORK;
+ default:
+ return 0;
+ }
+}
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index a6331ffa51e3..c10aecaaae44 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -38,22 +38,6 @@
#include "xfs_rmap.h"
/*
- * Determine the extent state.
- */
-/* ARGSUSED */
-STATIC xfs_exntst_t
-xfs_extent_state(
- xfs_filblks_t blks,
- int extent_flag)
-{
- if (extent_flag) {
- ASSERT(blks != 0); /* saved for DMIG */
- return XFS_EXT_UNWRITTEN;
- }
- return XFS_EXT_NORM;
-}
-
-/*
* Convert on-disk form of btree root to in-memory form.
*/
void
@@ -87,84 +71,21 @@ xfs_bmdr_to_bmbt(
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
-/*
- * Convert a compressed bmap extent record to an uncompressed form.
- * This code must be in sync with the routines xfs_bmbt_get_startoff,
- * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
- */
-STATIC void
-__xfs_bmbt_get_all(
- uint64_t l0,
- uint64_t l1,
- xfs_bmbt_irec_t *s)
-{
- int ext_flag;
- xfs_exntst_t st;
-
- ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
- s->br_startoff = ((xfs_fileoff_t)l0 &
- xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
- s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
- (((xfs_fsblock_t)l1) >> 21);
- s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
- /* This is xfs_extent_state() in-line */
- if (ext_flag) {
- ASSERT(s->br_blockcount != 0); /* saved for DMIG */
- st = XFS_EXT_UNWRITTEN;
- } else
- st = XFS_EXT_NORM;
- s->br_state = st;
-}
-
void
-xfs_bmbt_get_all(
- xfs_bmbt_rec_host_t *r,
- xfs_bmbt_irec_t *s)
-{
- __xfs_bmbt_get_all(r->l0, r->l1, s);
-}
-
-/*
- * Extract the blockcount field from an in memory bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_get_blockcount(
- xfs_bmbt_rec_host_t *r)
-{
- return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
-}
-
-/*
- * Extract the startblock field from an in memory bmap extent record.
- */
-xfs_fsblock_t
-xfs_bmbt_get_startblock(
- xfs_bmbt_rec_host_t *r)
-{
- return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
- (((xfs_fsblock_t)r->l1) >> 21);
-}
-
-/*
- * Extract the startoff field from an in memory bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_get_startoff(
- xfs_bmbt_rec_host_t *r)
-{
- return ((xfs_fileoff_t)r->l0 &
- xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-xfs_exntst_t
-xfs_bmbt_get_state(
- xfs_bmbt_rec_host_t *r)
-{
- int ext_flag;
-
- ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
- return xfs_extent_state(xfs_bmbt_get_blockcount(r),
- ext_flag);
+xfs_bmbt_disk_get_all(
+ struct xfs_bmbt_rec *rec,
+ struct xfs_bmbt_irec *irec)
+{
+ uint64_t l0 = get_unaligned_be64(&rec->l0);
+ uint64_t l1 = get_unaligned_be64(&rec->l1);
+
+ irec->br_startoff = (l0 & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+ irec->br_startblock = ((l0 & xfs_mask64lo(9)) << 43) | (l1 >> 21);
+ irec->br_blockcount = l1 & xfs_mask64lo(21);
+ if (l0 >> (64 - BMBT_EXNTFLAG_BITLEN))
+ irec->br_state = XFS_EXT_UNWRITTEN;
+ else
+ irec->br_state = XFS_EXT_NORM;
}
/*
@@ -188,142 +109,29 @@ xfs_bmbt_disk_get_startoff(
xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
}
-
-/*
- * Set all the fields in a bmap extent record from the arguments.
- */
-void
-xfs_bmbt_set_allf(
- xfs_bmbt_rec_host_t *r,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state)
-{
- int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
- ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
- ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
- ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
-
- ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
- r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9) |
- ((xfs_bmbt_rec_base_t)startblock >> 43);
- r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
-}
-
/*
* Set all the fields in a bmap extent record from the uncompressed form.
*/
void
-xfs_bmbt_set_all(
- xfs_bmbt_rec_host_t *r,
- xfs_bmbt_irec_t *s)
-{
- xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
- s->br_blockcount, s->br_state);
-}
-
-
-/*
- * Set all the fields in a disk format bmap extent record from the arguments.
- */
-void
-xfs_bmbt_disk_set_allf(
- xfs_bmbt_rec_t *r,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state)
-{
- int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
-
- ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
- ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
- ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
- ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
-
- r->l0 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)extent_flag << 63) |
- ((xfs_bmbt_rec_base_t)startoff << 9) |
- ((xfs_bmbt_rec_base_t)startblock >> 43));
- r->l1 = cpu_to_be64(
- ((xfs_bmbt_rec_base_t)startblock << 21) |
- ((xfs_bmbt_rec_base_t)blockcount &
- (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
-}
-
-/*
- * Set all the fields in a bmap extent record from the uncompressed form.
- */
-STATIC void
xfs_bmbt_disk_set_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
- s->br_blockcount, s->br_state);
-}
-
-/*
- * Set the blockcount field in a bmap extent record.
- */
-void
-xfs_bmbt_set_blockcount(
- xfs_bmbt_rec_host_t *r,
- xfs_filblks_t v)
+ struct xfs_bmbt_rec *r,
+ struct xfs_bmbt_irec *s)
{
- ASSERT((v & xfs_mask64hi(43)) == 0);
- r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
- (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
-}
-
-/*
- * Set the startblock field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startblock(
- xfs_bmbt_rec_host_t *r,
- xfs_fsblock_t v)
-{
- ASSERT((v & xfs_mask64hi(12)) == 0);
- r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
- (xfs_bmbt_rec_base_t)(v >> 43);
- r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
- (xfs_bmbt_rec_base_t)(v << 21);
-}
+ int extent_flag = (s->br_state != XFS_EXT_NORM);
-/*
- * Set the startoff field in a bmap extent record.
- */
-void
-xfs_bmbt_set_startoff(
- xfs_bmbt_rec_host_t *r,
- xfs_fileoff_t v)
-{
- ASSERT((v & xfs_mask64hi(9)) == 0);
- r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
- ((xfs_bmbt_rec_base_t)v << 9) |
- (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-}
+ ASSERT(s->br_state == XFS_EXT_NORM || s->br_state == XFS_EXT_UNWRITTEN);
+ ASSERT(!(s->br_startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)));
+ ASSERT(!(s->br_blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)));
+ ASSERT(!(s->br_startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)));
-/*
- * Set the extent state field in a bmap extent record.
- */
-void
-xfs_bmbt_set_state(
- xfs_bmbt_rec_host_t *r,
- xfs_exntst_t v)
-{
- ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
- if (v == XFS_EXT_NORM)
- r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
- else
- r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+ ((xfs_bmbt_rec_base_t)s->br_startblock >> 43), &r->l0);
+ put_unaligned_be64(
+ ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+ ((xfs_bmbt_rec_base_t)s->br_blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)), &r->l1);
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 9da5a8d4f184..135b8c56d23e 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -98,25 +98,11 @@ struct xfs_trans;
*/
extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
struct xfs_btree_block *, int);
-extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
-extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
-extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
-extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
+void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-
-extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
- xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
-extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
-extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
-extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
-
-extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
- xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
@@ -136,9 +122,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
* Check that the extent does not contain an invalid unwritten extent flag.
*/
static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
- struct xfs_bmbt_rec_host *ep)
+ struct xfs_bmbt_irec *irec)
{
- if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0)
+ if (irec->br_state == XFS_EXT_NORM)
return true;
if (whichfork == XFS_DATA_FORK &&
xfs_sb_version_hasextflgbit(&mp->m_sb))
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 5bfb88261c7e..5f33adf8eecb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -29,6 +29,7 @@
#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
@@ -63,44 +64,63 @@ xfs_btree_magic(
return magic;
}
-STATIC int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
- struct xfs_btree_cur *cur, /* btree cursor */
- struct xfs_btree_block *block, /* btree long form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp) /* buffer for block, if any */
+/*
+ * Check a long btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
{
- int lblock_ok = 1; /* block passes checks */
- struct xfs_mount *mp; /* file system mount point */
+ struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc;
-
- mp = cur->bc_mp;
- crc = xfs_sb_version_hascrc(&mp->m_sb);
+ int crc = xfs_sb_version_hascrc(&mp->m_sb);
if (crc) {
- lblock_ok = lblock_ok &&
- uuid_equal(&block->bb_u.l.bb_uuid,
- &mp->m_sb.sb_meta_uuid) &&
- block->bb_u.l.bb_blkno == cpu_to_be64(
- bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.l.bb_blkno !=
+ cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ return __this_address;
+ if (block->bb_u.l.bb_pad != cpu_to_be32(0))
+ return __this_address;
}
- lblock_ok = lblock_ok &&
- be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
- be16_to_cpu(block->bb_level) == level &&
- be16_to_cpu(block->bb_numrecs) <=
- cur->bc_ops->get_maxrecs(cur, level) &&
- block->bb_u.l.bb_leftsib &&
- (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
- block->bb_u.l.bb_rightsib &&
- (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
- XFS_FSB_SANITY_CHECK(mp,
- be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
- if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+ if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
+ !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
+ level + 1))
+ return __this_address;
+ if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
+ !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
+ level + 1))
+ return __this_address;
+
+ return NULL;
+}
+
+/* Check a long btree block header. */
+static int
+xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_lblock(cur, block, level, bp);
+ if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -110,48 +130,61 @@ xfs_btree_check_lblock(
return 0;
}
-STATIC int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
- struct xfs_btree_cur *cur, /* btree cursor */
- struct xfs_btree_block *block, /* btree short form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp) /* buffer containing block */
+/*
+ * Check a short btree block header. Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+xfs_failaddr_t
+__xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp; /* file system mount point */
- struct xfs_buf *agbp; /* buffer for ag. freespace struct */
- struct xfs_agf *agf; /* ag. freespace structure */
- xfs_agblock_t agflen; /* native ag. freespace length */
- int sblock_ok = 1; /* block passes checks */
+ struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
- int crc;
-
- mp = cur->bc_mp;
- crc = xfs_sb_version_hascrc(&mp->m_sb);
- agbp = cur->bc_private.a.agbp;
- agf = XFS_BUF_TO_AGF(agbp);
- agflen = be32_to_cpu(agf->agf_length);
+ int crc = xfs_sb_version_hascrc(&mp->m_sb);
if (crc) {
- sblock_ok = sblock_ok &&
- uuid_equal(&block->bb_u.s.bb_uuid,
- &mp->m_sb.sb_meta_uuid) &&
- block->bb_u.s.bb_blkno == cpu_to_be64(
- bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (block->bb_u.s.bb_blkno !=
+ cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
+ return __this_address;
}
- sblock_ok = sblock_ok &&
- be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
- be16_to_cpu(block->bb_level) == level &&
- be16_to_cpu(block->bb_numrecs) <=
- cur->bc_ops->get_maxrecs(cur, level) &&
- (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
- block->bb_u.s.bb_leftsib &&
- (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
- be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
- block->bb_u.s.bb_rightsib;
-
- if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
+ if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+ return __this_address;
+ if (be16_to_cpu(block->bb_level) != level)
+ return __this_address;
+ if (be16_to_cpu(block->bb_numrecs) >
+ cur->bc_ops->get_maxrecs(cur, level))
+ return __this_address;
+ if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
+ !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
+ level + 1))
+ return __this_address;
+ if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
+ !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
+ level + 1))
+ return __this_address;
+
+ return NULL;
+}
+
+/* Check a short btree block header. */
+STATIC int
+xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_failaddr_t fa;
+
+ fa = __xfs_btree_check_sblock(cur, block, level, bp);
+ if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -177,59 +210,53 @@ xfs_btree_check_block(
return xfs_btree_check_sblock(cur, block, level, bp);
}
-/*
- * Check that (long) pointer is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
+/* Check that this long pointer is valid and points within the fs. */
+bool
xfs_btree_check_lptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t bno, /* btree block disk address */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t fsbno,
+ int level)
{
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
- level > 0 &&
- bno != NULLFSBLOCK &&
- XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
- return 0;
+ if (level <= 0)
+ return false;
+ return xfs_verify_fsbno(cur->bc_mp, fsbno);
}
-#ifdef DEBUG
-/*
- * Check that (short) pointer is ok.
- */
-STATIC int /* error (0 or EFSCORRUPTED) */
+/* Check that this short pointer is valid and points within the AG. */
+bool
xfs_btree_check_sptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agblock_t bno, /* btree block disk address */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t agbno,
+ int level)
{
- xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
-
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
- level > 0 &&
- bno != NULLAGBLOCK &&
- bno != 0 &&
- bno < agblocks);
- return 0;
+ if (level <= 0)
+ return false;
+ return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
}
+#ifdef DEBUG
/*
- * Check that block ptr is ok.
+ * Check that a given (indexed) btree pointer at a certain level of a
+ * btree is valid and doesn't point past where it should.
*/
-STATIC int /* error (0 or EFSCORRUPTED) */
+static int
xfs_btree_check_ptr(
- struct xfs_btree_cur *cur, /* btree cursor */
- union xfs_btree_ptr *ptr, /* btree block disk address */
- int index, /* offset from ptr to check */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int index,
+ int level)
{
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- return xfs_btree_check_lptr(cur,
- be64_to_cpu((&ptr->l)[index]), level);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+ xfs_btree_check_lptr(cur,
+ be64_to_cpu((&ptr->l)[index]), level));
} else {
- return xfs_btree_check_sptr(cur,
- be32_to_cpu((&ptr->s)[index]), level);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
+ xfs_btree_check_sptr(cur,
+ be32_to_cpu((&ptr->s)[index]), level));
}
+
+ return 0;
}
#endif
@@ -1027,7 +1054,7 @@ xfs_btree_setbuf(
}
}
-STATIC int
+bool
xfs_btree_ptr_is_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
@@ -1052,7 +1079,7 @@ xfs_btree_set_ptr_null(
/*
* Get/set/init sibling pointers
*/
-STATIC void
+void
xfs_btree_get_sibling(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
@@ -2001,7 +2028,7 @@ error0:
}
/* Find the high key storage area from a regular key. */
-STATIC union xfs_btree_key *
+union xfs_btree_key *
xfs_btree_high_key_from_key(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
@@ -2075,7 +2102,7 @@ xfs_btree_get_node_keys(
}
/* Derive the keys for any btree block. */
-STATIC void
+void
xfs_btree_get_keys(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
@@ -4914,3 +4941,15 @@ xfs_btree_count_blocks(
return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
blocks);
}
+
+/* Compare two btree pointers. */
+int64_t
+xfs_btree_diff_two_ptrs(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
+ return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index f2a88c3b1159..b57501c6f71d 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -255,6 +255,14 @@ typedef struct xfs_btree_cur
*/
#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
+/*
+ * Internal long and short btree block checks. They return NULL if the
+ * block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, int level, struct xfs_buf *bp);
/*
* Check that block header is ok.
@@ -269,10 +277,19 @@ xfs_btree_check_block(
/*
* Check that (long) pointer is ok.
*/
-int /* error (0 or EFSCORRUPTED) */
+bool /* true if the pointer is ok, false otherwise */
xfs_btree_check_lptr(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_fsblock_t ptr, /* btree block disk address */
+ xfs_fsblock_t fsbno, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Check that (short) pointer is ok.
+ */
+bool /* true if the pointer is ok, false otherwise */
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t agbno, /* btree block disk address */
int level); /* btree block level */
/*
@@ -517,5 +534,16 @@ int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
+bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr);
+int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
+void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr, int lr);
+void xfs_btree_get_keys(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, union xfs_btree_key *key);
+union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index 8211f48b98e6..999a290cfd72 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_CKSUM_H
#define _XFS_CKSUM_H 1
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 6d4335815c3f..651611530d2f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -1466,6 +1466,7 @@ xfs_da3_node_lookup_int(
int max;
int error;
int retval;
+ unsigned int expected_level = 0;
struct xfs_inode *dp = state->args->dp;
args = state->args;
@@ -1474,7 +1475,7 @@ xfs_da3_node_lookup_int(
* Descend thru the B-tree searching each level for the right
* node to use, until the right hashval is found.
*/
- blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
+ blkno = args->geo->leafblk;
for (blk = &state->path.blk[0], state->path.active = 1;
state->path.active <= XFS_DA_NODE_MAXDEPTH;
blk++, state->path.active++) {
@@ -1517,6 +1518,18 @@ xfs_da3_node_lookup_int(
dp->d_ops->node_hdr_from_disk(&nodehdr, node);
btree = dp->d_ops->node_tree_p(node);
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ return -EFSCORRUPTED;
+
+ /* Check the level from the root. */
+ if (blkno == args->geo->leafblk)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ return -EFSCORRUPTED;
+ else
+ expected_level--;
+
max = nodehdr.count;
blk->hashval = be32_to_cpu(btree[max - 1].hashval);
@@ -1562,8 +1575,15 @@ xfs_da3_node_lookup_int(
blk->index = probe;
blkno = be32_to_cpu(btree[probe].before);
}
+
+ /* We can't point back to the root. */
+ if (blkno == args->geo->leafblk)
+ return -EFSCORRUPTED;
}
+ if (expected_level != 0)
+ return -EFSCORRUPTED;
+
/*
* A leaf block that ends in the hashval that we are interested in
* (final hashval == search hashval) means that the next block may
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index ccf9783fd3f0..e10778c102ea 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -30,6 +30,8 @@
#include "xfs_bmap.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -38,7 +40,9 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
/*
* Convert inode mode to directory entry filetype
*/
-unsigned char xfs_mode_to_ftype(int mode)
+unsigned char
+xfs_mode_to_ftype(
+ int mode)
{
switch (mode & S_IFMT) {
case S_IFREG:
@@ -202,22 +206,8 @@ xfs_dir_ino_validate(
xfs_mount_t *mp,
xfs_ino_t ino)
{
- xfs_agblock_t agblkno;
- xfs_agino_t agino;
- xfs_agnumber_t agno;
- int ino_ok;
- int ioff;
-
- agno = XFS_INO_TO_AGNO(mp, ino);
- agblkno = XFS_INO_TO_AGBNO(mp, ino);
- ioff = XFS_INO_TO_OFFSET(mp, ino);
- agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
- ino_ok =
- agno < mp->m_sb.sb_agcount &&
- agblkno < mp->m_sb.sb_agblocks &&
- agblkno != 0 &&
- ioff < (1 << mp->m_sb.sb_inopblog) &&
- XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+ bool ino_ok = xfs_verify_dir_ino(mp, ino);
+
if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 21c8f8bf94d5..1a8f2cf977ca 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -324,4 +324,21 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
sizeof(struct xfs_dir2_leaf_tail));
}
+/*
+ * The Linux API doesn't pass the total size of the buffer
+ * we read into down to the filesystem. With the filldir concept
+ * it's not needed for correct information, but the XFS dir2 leaf
+ * code wants an estimate of the buffer size to calculate its
+ * readahead window and size the buffers used for mapping to
+ * physical blocks.
+ *
+ * Try to give it an estimate that's good enough, maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ * musl libc hardcodes 2k and dietlibc uses PAGE_SIZE.
+ */
+#define XFS_READDIR_BUFSIZE (32768)
+
+unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
+
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
new file mode 100644
index 000000000000..bc1789d95152
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2017 Oracle.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_ERRORTAG_H_
+#define __XFS_ERRORTAG_H_
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR 0
+#define XFS_ERRTAG_IFLUSH_1 1
+#define XFS_ERRTAG_IFLUSH_2 2
+#define XFS_ERRTAG_IFLUSH_3 3
+#define XFS_ERRTAG_IFLUSH_4 4
+#define XFS_ERRTAG_IFLUSH_5 5
+#define XFS_ERRTAG_IFLUSH_6 6
+#define XFS_ERRTAG_DA_READ_BUF 7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
+#define XFS_ERRTAG_ALLOC_READ_AGF 10
+#define XFS_ERRTAG_IALLOC_READ_AGI 11
+#define XFS_ERRTAG_ITOBP_INOTOBP 12
+#define XFS_ERRTAG_IUNLINK 13
+#define XFS_ERRTAG_IUNLINK_REMOVE 14
+#define XFS_ERRTAG_DIR_INO_VALIDATE 15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
+#define XFS_ERRTAG_IODONE_IOERR 17
+#define XFS_ERRTAG_STRATREAD_IOERR 18
+#define XFS_ERRTAG_STRATCMPL_IOERR 19
+#define XFS_ERRTAG_DIOWRITE_IOERR 20
+#define XFS_ERRTAG_BMAPIFORMAT 21
+#define XFS_ERRTAG_FREE_EXTENT 22
+#define XFS_ERRTAG_RMAP_FINISH_ONE 23
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
+#define XFS_ERRTAG_BMAP_FINISH_ONE 26
+#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+/*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are silently dropped and handled as if they failed.
+ * All delalloc blocks in the range of the write (including pre-existing
+ * delalloc blocks!) are tossed as part of the write failure error
+ * handling sequence.
+ */
+#define XFS_ERRTAG_DROP_WRITES 28
+#define XFS_ERRTAG_LOG_BAD_CRC 29
+#define XFS_ERRTAG_LOG_ITEM_PIN 30
+#define XFS_ERRTAG_BUF_LRU_REF 31
+#define XFS_ERRTAG_MAX 32
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT 100
+#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_FREE_EXTENT 1
+#define XFS_RANDOM_RMAP_FINISH_ONE 1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
+#define XFS_RANDOM_BMAP_FINISH_ONE 1
+#define XFS_RANDOM_AG_RESV_CRITICAL 4
+#define XFS_RANDOM_DROP_WRITES 1
+#define XFS_RANDOM_LOG_BAD_CRC 1
+#define XFS_RANDOM_LOG_ITEM_PIN 1
+#define XFS_RANDOM_BUF_LRU_REF 2
+
+#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 23229f0c5b15..1acb584fc5f7 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -315,6 +315,11 @@ static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
return false;
}
+static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp)
+{
+ return sbp->sb_rblocks > 0;
+}
+
/*
* Detect a mismatched features2 field. Older kernels read/wrote
* this into the wrong slot, so to be safe we keep them in sync.
@@ -500,12 +505,12 @@ xfs_sb_has_incompat_log_feature(
/*
* V5 superblock specific feature checks
*/
-static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
-static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
@@ -518,7 +523,7 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
(sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
}
-static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
@@ -941,7 +946,7 @@ typedef enum xfs_dinode_fmt {
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
- XFS_DINODE_FMT_UUID /* uuid_t */
+ XFS_DINODE_FMT_UUID /* added long ago, but never used */
} xfs_dinode_fmt_t;
/*
@@ -1142,7 +1147,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* Dquot and dquot block format definitions
*/
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
-#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
+#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
/*
* This is the main portion of the on-disk representation of quota
@@ -1548,10 +1553,6 @@ typedef struct xfs_bmbt_rec {
typedef uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
-typedef struct xfs_bmbt_rec_host {
- uint64_t l0, l1;
-} xfs_bmbt_rec_host_t;
-
/*
* Values and macros for delayed-allocation startblock fields.
*/
@@ -1577,24 +1578,6 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
}
/*
- * Possible extent states.
- */
-typedef enum {
- XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
-} xfs_exntst_t;
-
-/*
- * Incore version of above.
- */
-typedef struct xfs_bmbt_irec
-{
- xfs_fileoff_t br_startoff; /* starting file offset */
- xfs_fsblock_t br_startblock; /* starting block number */
- xfs_filblks_t br_blockcount; /* number of blocks */
- xfs_exntst_t br_state; /* extent state */
-} xfs_bmbt_irec_t;
-
-/*
* Key structure for non-leaf levels of the tree.
*/
typedef struct xfs_bmbt_key {
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 8c61f21535d4..b90924104596 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -468,6 +468,82 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+/*
+ * Metadata scrubbing: the argument structure used with
+ * XFS_IOC_SCRUB_METADATA.  The named fields occupy 24 bytes and the
+ * reserved words pad the structure out to 64 bytes.
+ */
+struct xfs_scrub_metadata {
+ __u32 sm_type; /* What to check?  One of XFS_SCRUB_TYPE_*. */
+ __u32 sm_flags; /* flags; see below (XFS_SCRUB_[IO]FLAG_*). */
+ __u64 sm_ino; /* inode number. */
+ __u32 sm_gen; /* inode generation. */
+ __u32 sm_agno; /* ag number. */
+ __u64 sm_reserved[5]; /* pad to 64 bytes */
+};
+
+/*
+ * Metadata types and flags for scrub operation.
+ */
+
+/* Scrub subcommands. */
+#define XFS_SCRUB_TYPE_PROBE 0 /* presence test ioctl */
+#define XFS_SCRUB_TYPE_SB 1 /* superblock */
+#define XFS_SCRUB_TYPE_AGF 2 /* AG free header */
+#define XFS_SCRUB_TYPE_AGFL 3 /* AG free list */
+#define XFS_SCRUB_TYPE_AGI 4 /* AG inode header */
+#define XFS_SCRUB_TYPE_BNOBT 5 /* freesp by block btree */
+#define XFS_SCRUB_TYPE_CNTBT 6 /* freesp by length btree */
+#define XFS_SCRUB_TYPE_INOBT 7 /* inode btree */
+#define XFS_SCRUB_TYPE_FINOBT 8 /* free inode btree */
+#define XFS_SCRUB_TYPE_RMAPBT 9 /* reverse mapping btree */
+#define XFS_SCRUB_TYPE_REFCNTBT 10 /* reference count btree */
+#define XFS_SCRUB_TYPE_INODE 11 /* inode record */
+#define XFS_SCRUB_TYPE_BMBTD 12 /* data fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTA 13 /* attr fork block mapping */
+#define XFS_SCRUB_TYPE_BMBTC 14 /* CoW fork block mapping */
+#define XFS_SCRUB_TYPE_DIR 15 /* directory */
+#define XFS_SCRUB_TYPE_XATTR 16 /* extended attribute */
+#define XFS_SCRUB_TYPE_SYMLINK 17 /* symbolic link */
+#define XFS_SCRUB_TYPE_PARENT 18 /* parent pointers */
+#define XFS_SCRUB_TYPE_RTBITMAP 19 /* realtime bitmap */
+#define XFS_SCRUB_TYPE_RTSUM 20 /* realtime summary */
+#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */
+#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
+#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
+
+/* Number of scrub subcommands. */
+#define XFS_SCRUB_TYPE_NR 24
+
+/* i: Repair this metadata. */
+#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+
+/* o: Metadata object needs repair. */
+#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+
+/*
+ * o: Metadata object could be optimized. It's not corrupt, but
+ * we could improve on it somehow.
+ */
+#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+
+/* o: Cross-referencing failed. */
+#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+
+/* o: Metadata object disagrees with cross-referenced metadata. */
+#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+
+/* o: Scan was not complete. */
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+
+/* o: Metadata object looked funny but isn't corrupt. */
+#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING)
+#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
+
/*
* ioctl limits
*/
@@ -511,6 +587,7 @@ typedef struct xfs_swapext
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/* XFS_IOC_GETFSMAP ------ hoisted 59 */
+#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 988bb3f31446..de3f04a98656 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -31,6 +31,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_cksum.h"
@@ -1962,7 +1963,7 @@ xfs_difree_inobt(
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
- xic->deleted = 1;
+ xic->deleted = true;
xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
@@ -1989,7 +1990,7 @@ xfs_difree_inobt(
xfs_difree_inode_chunk(mp, agno, &rec, dfops);
} else {
- xic->deleted = 0;
+ xic->deleted = false;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -2664,3 +2665,93 @@ xfs_ialloc_pagi_init(
xfs_trans_brelse(tp, bp);
return 0;
}
+
+/*
+ * Report the lowest and highest AG inode numbers that could possibly exist
+ * in AG @agno.  The results are returned through @first and @last.
+ */
+void
+xfs_ialloc_agino_range(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		*first,
+	xfs_agino_t		*last)
+{
+	xfs_agblock_t		ag_blocks = xfs_ag_block_count(mp, agno);
+	xfs_agblock_t		first_bno;
+	xfs_agblock_t		last_bno;
+
+	/* Inodes begin at the first cluster-aligned block past the AGFL. */
+	first_bno = round_up(XFS_AGFL_BLOCK(mp) + 1,
+			xfs_ialloc_cluster_alignment(mp));
+	*first = XFS_OFFBNO_TO_AGINO(mp, first_bno, 0);
+
+	/*
+	 * Inodes end with the last slot before the final cluster-aligned
+	 * block boundary that still lies within the AG.
+	 */
+	last_bno = round_down(ag_blocks, xfs_ialloc_cluster_alignment(mp));
+	*last = XFS_OFFBNO_TO_AGINO(mp, last_bno, 0) - 1;
+}
+
+/*
+ * Check that an AG inode number lies inside the range of inode numbers
+ * that xfs_ialloc_agino_range() reports as possible for this AG.
+ */
+bool
+xfs_verify_agino(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino)
+{
+	xfs_agino_t		lowest;
+	xfs_agino_t		highest;
+
+	xfs_ialloc_agino_range(mp, agno, &lowest, &highest);
+	if (agino < lowest)
+		return false;
+	return agino <= highest;
+}
+
+/*
+ * Check a filesystem-wide inode number: its AG must exist, the number must
+ * survive a split/recombine round trip, and the AG-relative part must pass
+ * xfs_verify_agino().
+ */
+bool
+xfs_verify_ino(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ino);
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+
+	/* AG number past the end of the filesystem, or stray bits set? */
+	if (agno >= mp->m_sb.sb_agcount ||
+	    XFS_AGINO_TO_INO(mp, agno, agino) != ino)
+		return false;
+
+	return xfs_verify_agino(mp, agno, agino);
+}
+
+/*
+ * Return true if @ino is one of the superblock's realtime bitmap/summary
+ * inodes or, when the quota feature is present, a quota inode.
+ */
+bool
+xfs_internal_inum(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	if (ino == mp->m_sb.sb_rbmino)
+		return true;
+	if (ino == mp->m_sb.sb_rsumino)
+		return true;
+	if (!xfs_sb_version_hasquota(&mp->m_sb))
+		return false;
+	return xfs_is_quota_inode(&mp->m_sb, ino);
+}
+
+/*
+ * A directory entry may only reference an inode number that is not an
+ * internal inode and that passes the general xfs_verify_ino() checks.
+ */
+bool
+xfs_verify_dir_ino(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	return !xfs_internal_inum(mp, ino) && xfs_verify_ino(mp, ino);
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index b32cfb5aeb5b..d2bdcd5e7312 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -173,5 +173,12 @@ void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
+bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
new file mode 100644
index 000000000000..19e546a41251
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2017 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+
+/*
+ * In-core extent record layout:
+ *
+ * +-------+----------------------------+
+ * | 00:53 | all 54 bits of startoff |
+ * | 54:63 | low 10 bits of startblock |
+ * +-------+----------------------------+
+ * | 00:20 | all 21 bits of length |
+ * | 21 | unwritten extent bit |
+ * | 22:63 | high 42 bits of startblock |
+ * +-------+----------------------------+
+ */
+#define XFS_IEXT_STARTOFF_MASK xfs_mask64lo(BMBT_STARTOFF_BITLEN)
+#define XFS_IEXT_LENGTH_MASK xfs_mask64lo(BMBT_BLOCKCOUNT_BITLEN)
+#define XFS_IEXT_STARTBLOCK_MASK xfs_mask64lo(BMBT_STARTBLOCK_BITLEN)
+
+/* Packed in-core extent record; see the bit layout table above. */
+struct xfs_iext_rec {
+ uint64_t lo; /* startoff plus the low 10 bits of startblock */
+ uint64_t hi; /* length, unwritten bit, high 42 bits of startblock */
+};
+
+/*
+ * A record slot is unused when its hi word is zero: hi carries the extent
+ * length, and a real extent never has a length of zero.
+ */
+static bool xfs_iext_rec_is_empty(struct xfs_iext_rec *rec)
+{
+	return !rec->hi;
+}
+
+/* Zero both words of a record, which marks the slot as unused. */
+static inline void xfs_iext_rec_clear(struct xfs_iext_rec *rec)
+{
+	rec->lo = rec->hi = 0;
+}
+
+/*
+ * Pack the expanded extent @irec into the two-word in-core record @rec,
+ * following the bit layout described at the top of this file.
+ */
+static void
+xfs_iext_set(
+	struct xfs_iext_rec	*rec,
+	struct xfs_bmbt_irec	*irec)
+{
+	xfs_fsblock_t		startblock = irec->br_startblock;
+	uint64_t		lo;
+	uint64_t		hi;
+
+	ASSERT((irec->br_startoff & ~XFS_IEXT_STARTOFF_MASK) == 0);
+	ASSERT((irec->br_blockcount & ~XFS_IEXT_LENGTH_MASK) == 0);
+	ASSERT((startblock & ~XFS_IEXT_STARTBLOCK_MASK) == 0);
+
+	/* lo: startoff in the low bits, low 10 startblock bits on top */
+	lo = irec->br_startoff & XFS_IEXT_STARTOFF_MASK;
+	lo |= (startblock << 54);
+
+	/* hi: length in the low bits, remaining startblock bits from bit 22 */
+	hi = irec->br_blockcount & XFS_IEXT_LENGTH_MASK;
+	hi |= ((startblock & ~xfs_mask64lo(10)) << (22 - 10));
+
+	/* bit 21 of hi flags an unwritten extent */
+	if (irec->br_state == XFS_EXT_UNWRITTEN)
+		hi |= (1 << 21);
+
+	rec->lo = lo;
+	rec->hi = hi;
+}
+
+/*
+ * Unpack the two-word in-core record @rec into the expanded extent @irec;
+ * this is the inverse of xfs_iext_set().
+ */
+static void
+xfs_iext_get(
+	struct xfs_bmbt_irec	*irec,
+	struct xfs_iext_rec	*rec)
+{
+	uint64_t		lo = rec->lo;
+	uint64_t		hi = rec->hi;
+
+	irec->br_startoff = lo & XFS_IEXT_STARTOFF_MASK;
+	irec->br_blockcount = hi & XFS_IEXT_LENGTH_MASK;
+
+	/* low 10 bits come from the top of lo, the rest from hi */
+	irec->br_startblock = (lo >> 54) |
+			((hi & xfs_mask64hi(42)) >> (22 - 10));
+
+	irec->br_state = (hi & (1 << 21)) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+}
+
+/*
+ * Sizing of the in-core btree blocks.  Both block types are allocated with
+ * NODE_SIZE bytes: an inner node holds as many (key, pointer) pairs as fit,
+ * a leaf holds as many records as fit once room for its two sibling
+ * pointers has been set aside.
+ */
+enum {
+ NODE_SIZE = 256,
+ KEYS_PER_NODE = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
+ RECS_PER_LEAF = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
+ sizeof(struct xfs_iext_rec),
+};
+
+/*
+ * In-core extent btree block layout:
+ *
+ * There are two types of blocks in the btree: leaf and inner (non-leaf) blocks.
+ *
+ * The leaf blocks are made up of %RECS_PER_LEAF extent records, each of which
+ * contains the startoffset, blockcount, startblock and unwritten extent flag
+ * (see above for the exact format).  The records are followed by pointers to
+ * the previous and next leaf blocks (if there are any).
+ *
+ * The inner (non-leaf) blocks first contain KEYS_PER_NODE lookup keys, followed
+ * by an equal number of pointers to the btree blocks at the next lower level.
+ *
+ * +-------+-------+-------+-------+-------+----------+----------+
+ * Leaf: | rec 1 | rec 2 | rec 3 | rec 4 | rec N | prev-ptr | next-ptr |
+ * +-------+-------+-------+-------+-------+----------+----------+
+ *
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ * Inner: | key 1 | key 2 | key 3 | key N | ptr 1 | ptr 2 | ptr3 | ptr N |
+ * +-------+-------+-------+-------+-------+-------+------+-------+
+ */
+/*
+ * Inner (non-leaf) block.  keys[i] is the lookup key for the child block in
+ * ptrs[i].  Unused slots carry XFS_IEXT_KEY_INVALID as their key.
+ */
+struct xfs_iext_node {
+ uint64_t keys[KEYS_PER_NODE];
+#define XFS_IEXT_KEY_INVALID (1ULL << 63)
+ void *ptrs[KEYS_PER_NODE];
+};
+
+/*
+ * Leaf block: packed extent records followed by links to the neighbouring
+ * leaves.  Unused record slots are all-zero (see xfs_iext_rec_is_empty()).
+ */
+struct xfs_iext_leaf {
+ struct xfs_iext_rec recs[RECS_PER_LEAF];
+ struct xfs_iext_leaf *prev; /* previous leaf, or NULL */
+ struct xfs_iext_leaf *next; /* next leaf, or NULL */
+};
+
+/* Number of extent records in the fork, derived from its byte count. */
+inline xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp)
+{
+	const size_t		rec_size = sizeof(struct xfs_iext_rec);
+
+	return ifp->if_bytes / rec_size;
+}
+
+/*
+ * Number of record slots that may be inspected in a leaf.  A tree of height
+ * one has a root leaf sized to its contents, so only the current record
+ * count is valid; otherwise every leaf has RECS_PER_LEAF slots.
+ */
+static inline int xfs_iext_max_recs(struct xfs_ifork *ifp)
+{
+	if (ifp->if_height != 1)
+		return RECS_PER_LEAF;
+	return xfs_iext_count(ifp);
+}
+
+/* Address of the record slot the cursor currently points at. */
+static inline struct xfs_iext_rec *cur_rec(struct xfs_iext_cursor *cur)
+{
+	return cur->leaf->recs + cur->pos;
+}
+
+/*
+ * A cursor is valid when it has a leaf, its position is within the usable
+ * slots of that leaf, and the slot it addresses holds a record.
+ */
+static inline bool xfs_iext_valid(struct xfs_ifork *ifp,
+		struct xfs_iext_cursor *cur)
+{
+	return cur->leaf &&
+	       cur->pos >= 0 &&
+	       cur->pos < xfs_iext_max_recs(ifp) &&
+	       !xfs_iext_rec_is_empty(cur_rec(cur));
+}
+
+/*
+ * Descend from the root along the first pointer of every inner node and
+ * return the leftmost leaf, or NULL if the tree is empty.
+ */
+static void *
+xfs_iext_find_first_leaf(
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_iext_node	*node = ifp->if_u1.if_root;
+	int			level = ifp->if_height;
+
+	if (level == 0)
+		return NULL;
+
+	while (level > 1) {
+		node = node->ptrs[0];
+		ASSERT(node);
+		level--;
+	}
+
+	return node;
+}
+
+/*
+ * Descend from the root along the last populated pointer of every inner
+ * node and return the rightmost leaf, or NULL if the tree is empty.
+ */
+static void *
+xfs_iext_find_last_leaf(
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_iext_node	*node = ifp->if_u1.if_root;
+	int			level = ifp->if_height;
+	int			slot;
+
+	if (level == 0)
+		return NULL;
+
+	for (; level > 1; level--) {
+		/* find the first empty pointer slot after slot 0 */
+		slot = 1;
+		while (slot < KEYS_PER_NODE && node->ptrs[slot])
+			slot++;
+
+		node = node->ptrs[slot - 1];
+		ASSERT(node);
+	}
+
+	return node;
+}
+
+/* Point the cursor at slot 0 of the leftmost leaf (NULL leaf if empty). */
+void
+xfs_iext_first(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_cursor	*cur)
+{
+	cur->leaf = xfs_iext_find_first_leaf(ifp);
+	cur->pos = 0;
+}
+
+/*
+ * Point the cursor at the last used record of the rightmost leaf.  For an
+ * empty tree the cursor gets a NULL leaf and position 0.
+ */
+void
+xfs_iext_last(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_cursor	*cur)
+{
+	struct xfs_iext_leaf	*leaf = xfs_iext_find_last_leaf(ifp);
+	int			slot = 1;
+
+	cur->leaf = leaf;
+	if (!leaf) {
+		cur->pos = 0;
+		return;
+	}
+
+	/* advance to the first unused slot after slot 0 */
+	while (slot < xfs_iext_max_recs(ifp) &&
+	       !xfs_iext_rec_is_empty(&leaf->recs[slot]))
+		slot++;
+
+	cur->pos = slot - 1;
+}
+
+/*
+ * Advance the cursor by one record.  A cursor without a leaf is restarted
+ * at the first record.  When the advanced position is no longer valid and
+ * a following leaf exists, the cursor moves to slot 0 of that leaf.
+ */
+void
+xfs_iext_next(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_cursor	*cur)
+{
+	if (cur->leaf == NULL) {
+		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+		xfs_iext_first(ifp, cur);
+		return;
+	}
+
+	ASSERT(cur->pos >= 0);
+	ASSERT(cur->pos < xfs_iext_max_recs(ifp));
+
+	cur->pos++;
+
+	/* a single-leaf tree has no sibling to move on to */
+	if (ifp->if_height <= 1)
+		return;
+	if (xfs_iext_valid(ifp, cur))
+		return;
+	if (!cur->leaf->next)
+		return;
+
+	cur->leaf = cur->leaf->next;
+	cur->pos = 0;
+}
+
+/*
+ * Step the cursor back by one record.  A cursor without a leaf is restarted
+ * at the last record.  When the current leaf has no valid record before the
+ * cursor and a previous leaf exists, the search continues from the end of
+ * that leaf.
+ */
+void
+xfs_iext_prev(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_cursor	*cur)
+{
+	if (cur->leaf == NULL) {
+		ASSERT(cur->pos <= 0 || cur->pos >= RECS_PER_LEAF);
+		xfs_iext_last(ifp, cur);
+		return;
+	}
+
+	ASSERT(cur->pos >= 0);
+	ASSERT(cur->pos <= RECS_PER_LEAF);
+
+	for (;;) {
+		/* scan backwards within the current leaf */
+		do {
+			cur->pos--;
+			if (xfs_iext_valid(ifp, cur))
+				return;
+		} while (cur->pos > 0);
+
+		if (ifp->if_height <= 1 || !cur->leaf->prev)
+			return;
+
+		/* restart one past the last slot of the previous leaf */
+		cur->leaf = cur->leaf->prev;
+		cur->pos = RECS_PER_LEAF;
+	}
+}
+
+/*
+ * Three-way compare of key @n in @node against @offset: 1 if the key is
+ * larger, -1 if it is smaller, 0 if they are equal.
+ */
+static inline int
+xfs_iext_key_cmp(
+	struct xfs_iext_node	*node,
+	int			n,
+	xfs_fileoff_t		offset)
+{
+	uint64_t		key = node->keys[n];
+
+	if (key == offset)
+		return 0;
+	return key > offset ? 1 : -1;
+}
+
+/*
+ * Compare the range covered by @rec against @offset: 1 if the record starts
+ * after @offset, -1 if it ends at or before @offset, 0 if it covers @offset.
+ */
+static inline int
+xfs_iext_rec_cmp(
+	struct xfs_iext_rec	*rec,
+	xfs_fileoff_t		offset)
+{
+	uint64_t		start = rec->lo & XFS_IEXT_STARTOFF_MASK;
+	uint32_t		len = rec->hi & XFS_IEXT_LENGTH_MASK;
+
+	if (offset < start)
+		return 1;
+	return (start + len <= offset) ? -1 : 0;
+}
+
+/*
+ * Walk down from the root towards @offset and return the block found at
+ * @level (1 is the leaf level).  Returns NULL for an empty tree or when the
+ * chosen child pointer is NULL.
+ */
+static void *
+xfs_iext_find_level(
+	struct xfs_ifork	*ifp,
+	xfs_fileoff_t		offset,
+	int			level)
+{
+	struct xfs_iext_node	*node = ifp->if_u1.if_root;
+	int			height = ifp->if_height;
+	int			slot;
+
+	if (height == 0)
+		return NULL;
+
+	for (; height > level; height--) {
+		/* first slot (after slot 0) whose key lies beyond offset */
+		slot = 1;
+		while (slot < KEYS_PER_NODE &&
+		       xfs_iext_key_cmp(node, slot, offset) <= 0)
+			slot++;
+
+		node = node->ptrs[slot - 1];
+		if (!node)
+			return NULL;
+	}
+
+	return node;
+}
+
+/*
+ * Index of the slot in @node to follow for @offset: the slot just before
+ * the first key (ignoring slot 0) that is larger than @offset.
+ */
+static int
+xfs_iext_node_pos(
+	struct xfs_iext_node	*node,
+	xfs_fileoff_t		offset)
+{
+	int			slot = 1;
+
+	while (slot < KEYS_PER_NODE &&
+	       xfs_iext_key_cmp(node, slot, offset) <= 0)
+		slot++;
+
+	return slot - 1;
+}
+
+/*
+ * Index at which a new key @offset belongs in @node: the first slot whose
+ * key is larger than @offset, or KEYS_PER_NODE if there is none.
+ */
+static int
+xfs_iext_node_insert_pos(
+	struct xfs_iext_node	*node,
+	xfs_fileoff_t		offset)
+{
+	int			slot = 0;
+
+	while (slot < KEYS_PER_NODE &&
+	       xfs_iext_key_cmp(node, slot, offset) <= 0)
+		slot++;
+
+	return slot;
+}
+
+/*
+ * Count the used entries in @node.  Scanning begins at @start, so slots
+ * below @start are taken to be in use; the result is the index of the first
+ * invalid key at or after @start (KEYS_PER_NODE if the node is full).
+ */
+static int
+xfs_iext_node_nr_entries(
+	struct xfs_iext_node	*node,
+	int			start)
+{
+	int			slot = start;
+
+	while (slot < KEYS_PER_NODE &&
+	       node->keys[slot] != XFS_IEXT_KEY_INVALID)
+		slot++;
+
+	return slot;
+}
+
+/*
+ * Count the used records in @leaf.  Scanning begins at @start, so slots
+ * below @start are taken to be in use; the result is the index of the first
+ * empty record at or after @start, bounded by xfs_iext_max_recs().
+ */
+static int
+xfs_iext_leaf_nr_entries(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_leaf	*leaf,
+	int			start)
+{
+	int			slot = start;
+
+	while (slot < xfs_iext_max_recs(ifp) &&
+	       !xfs_iext_rec_is_empty(&leaf->recs[slot]))
+		slot++;
+
+	return slot;
+}
+
+/* Lookup key of record @n in @leaf, i.e. the record's start offset. */
+static inline uint64_t
+xfs_iext_leaf_key(
+	struct xfs_iext_leaf	*leaf,
+	int			n)
+{
+	struct xfs_iext_rec	*rec = &leaf->recs[n];
+
+	return rec->lo & XFS_IEXT_STARTOFF_MASK;
+}
+
+/*
+ * Add a level to the tree: allocate a new root node whose only entry
+ * references the old root, keyed by the old root's first key.
+ */
+static void
+xfs_iext_grow(
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_iext_node	*node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+	void			*old_root = ifp->if_u1.if_root;
+	int			slot;
+
+	if (ifp->if_height == 1) {
+		/* the old root is a leaf */
+		struct xfs_iext_leaf	*leaf = old_root;
+
+		node->keys[0] = xfs_iext_leaf_key(leaf, 0);
+	} else {
+		/* the old root is an inner node */
+		struct xfs_iext_node	*inner = old_root;
+
+		ASSERT(ifp->if_height > 1);
+
+		node->keys[0] = inner->keys[0];
+	}
+	node->ptrs[0] = old_root;
+
+	/* every remaining slot of the new root is unused */
+	for (slot = 1; slot < KEYS_PER_NODE; slot++)
+		node->keys[slot] = XFS_IEXT_KEY_INVALID;
+
+	ifp->if_u1.if_root = node;
+	ifp->if_height++;
+}
+
+/*
+ * Replace @old_offset with @new_offset in the keys found on the path from
+ * the root down to @level.  @ptr is the block expected at @level; it is only
+ * used to assert that the walk ended in the right place.
+ */
+static void
+xfs_iext_update_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t old_offset,
+ xfs_fileoff_t new_offset,
+ int level,
+ void *ptr)
+{
+ struct xfs_iext_node *node = ifp->if_u1.if_root;
+ int height, i;
+
+ for (height = ifp->if_height; height > level; height--) {
+ for (i = 0; i < KEYS_PER_NODE; i++) {
+ /* stop at the first key past old_offset; slot 0 never stops */
+ if (i > 0 && xfs_iext_key_cmp(node, i, old_offset) > 0)
+ break;
+ if (node->keys[i] == old_offset)
+ node->keys[i] = new_offset;
+ }
+ /* descend through the last slot that was not past old_offset */
+ node = node->ptrs[i - 1];
+ ASSERT(node);
+ }
+
+ ASSERT(node == ptr);
+}
+
+/*
+ * Split the full inner node *@nodep and return the newly allocated node.
+ *
+ * On return *@nodep, *@pos and *@nr_entries describe where the caller should
+ * perform its pending insertion: the node to insert into, the slot within
+ * it, and how many entries that node currently holds.
+ */
+static struct xfs_iext_node *
+xfs_iext_split_node(
+ struct xfs_iext_node **nodep,
+ int *pos,
+ int *nr_entries)
+{
+ struct xfs_iext_node *node = *nodep;
+ struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = KEYS_PER_NODE / 2;
+ int nr_keep = nr_move + (KEYS_PER_NODE & 1);
+ int i = 0;
+
+ /* for sequential append operations just spill over into the new node */
+ if (*pos == KEYS_PER_NODE) {
+ *nodep = new;
+ *pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+
+ /* move the upper nr_move entries over and mark the old slots unused */
+ for (i = 0; i < nr_move; i++) {
+ new->keys[i] = node->keys[nr_keep + i];
+ new->ptrs[i] = node->ptrs[nr_keep + i];
+
+ node->keys[nr_keep + i] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_keep + i] = NULL;
+ }
+
+ /* redirect the insertion if its slot moved to the new node */
+ if (*pos >= nr_keep) {
+ *nodep = new;
+ *pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ /* i is 0 on the append path, nr_move otherwise */
+ for (; i < KEYS_PER_NODE; i++)
+ new->keys[i] = XFS_IEXT_KEY_INVALID;
+ return new;
+}
+
+/*
+ * Insert the (@offset, @ptr) pair into the inner node at @level that covers
+ * @offset.  The tree grows by one level if it is not yet @level high.  If
+ * the target node is full it is split, and the newly created node is then
+ * inserted one level further up by looping back to "again".
+ */
+static void
+xfs_iext_insert_node(
+ struct xfs_ifork *ifp,
+ uint64_t offset,
+ void *ptr,
+ int level)
+{
+ struct xfs_iext_node *node, *new;
+ int i, pos, nr_entries;
+
+again:
+ if (ifp->if_height < level)
+ xfs_iext_grow(ifp);
+
+ new = NULL;
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_insert_pos(node, offset);
+ nr_entries = xfs_iext_node_nr_entries(node, pos);
+
+ ASSERT(pos >= nr_entries || xfs_iext_key_cmp(node, pos, offset) != 0);
+ ASSERT(nr_entries <= KEYS_PER_NODE);
+
+ /* may redirect node/pos/nr_entries to the freshly allocated node */
+ if (nr_entries == KEYS_PER_NODE)
+ new = xfs_iext_split_node(&node, &pos, &nr_entries);
+
+ /*
+ * Update the pointers in higher levels if the first entry changes
+ * in an existing node.
+ */
+ if (node != new && pos == 0 && nr_entries > 0)
+ xfs_iext_update_node(ifp, node->keys[0], offset, level, node);
+
+ /* shift the entries from pos onwards up by one to open the slot */
+ for (i = nr_entries; i > pos; i--) {
+ node->keys[i] = node->keys[i - 1];
+ node->ptrs[i] = node->ptrs[i - 1];
+ }
+ node->keys[pos] = offset;
+ node->ptrs[pos] = ptr;
+
+ /* a split happened: link the new node into the level above */
+ if (new) {
+ offset = new->keys[0];
+ ptr = new;
+ level++;
+ goto again;
+ }
+}
+
+/*
+ * Split the full leaf under @cur and return the newly allocated leaf, which
+ * is linked in directly after the old one.
+ *
+ * On return @cur addresses the leaf and slot at which the caller should
+ * perform its pending insertion, and *@nr_entries holds the number of
+ * records currently in that leaf.
+ */
+static struct xfs_iext_leaf *
+xfs_iext_split_leaf(
+ struct xfs_iext_cursor *cur,
+ int *nr_entries)
+{
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+ const int nr_move = RECS_PER_LEAF / 2;
+ int nr_keep = nr_move + (RECS_PER_LEAF & 1);
+ int i;
+
+ /* for sequential append operations just spill over into the new node */
+ if (cur->pos == RECS_PER_LEAF) {
+ cur->leaf = new;
+ cur->pos = 0;
+ *nr_entries = 0;
+ goto done;
+ }
+
+ /* move the upper nr_move records over and clear the old slots */
+ for (i = 0; i < nr_move; i++) {
+ new->recs[i] = leaf->recs[nr_keep + i];
+ xfs_iext_rec_clear(&leaf->recs[nr_keep + i]);
+ }
+
+ /* redirect the cursor if its slot moved to the new leaf */
+ if (cur->pos >= nr_keep) {
+ cur->leaf = new;
+ cur->pos -= nr_keep;
+ *nr_entries = nr_move;
+ } else {
+ *nr_entries = nr_keep;
+ }
+done:
+ /* splice the new leaf into the sibling list right after the old leaf */
+ if (leaf->next)
+ leaf->next->prev = new;
+ new->next = leaf->next;
+ new->prev = leaf;
+ leaf->next = new;
+ return new;
+}
+
+/*
+ * Create the root of an empty tree: a zeroed allocation with room for one
+ * record.  The cursor is left pointing at slot 0 of the new root.
+ */
+static void
+xfs_iext_alloc_root(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_cursor	*cur)
+{
+	void			*root;
+
+	ASSERT(ifp->if_bytes == 0);
+
+	root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+	ifp->if_u1.if_root = root;
+	ifp->if_height = 1;
+
+	/* the new root is the leaf the caller will insert into */
+	cur->leaf = root;
+	cur->pos = 0;
+}
+
+/*
+ * Grow the root leaf of a height-one tree by one record slot.  Once the
+ * root would hold RECS_PER_LEAF records it is sized to a full NODE_SIZE
+ * block instead.  The added space is zeroed, and @cur is pointed at the
+ * block returned by the reallocation.
+ */
+static void
+xfs_iext_realloc_root(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur)
+{
+ size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
+ void *new;
+
+ /* account for the prev/next pointers */
+ if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
+ new_size = NODE_SIZE;
+
+ new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
+ /* zero everything past the bytes that were already in use */
+ memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
+ ifp->if_u1.if_root = new;
+ cur->leaf = new;
+}
+
+/*
+ * Insert the extent @irec at the position given by @cur into the fork of
+ * @ip selected by @state.
+ *
+ * The root is allocated (empty tree) or grown (height-one tree) as needed.
+ * A full leaf is split first, and the new leaf is linked into the inner
+ * levels once the record is in place.  On return @cur addresses the newly
+ * inserted record.
+ */
+void
+xfs_iext_insert(
+	struct xfs_inode	*ip,
+	struct xfs_iext_cursor	*cur,
+	struct xfs_bmbt_irec	*irec,
+	int			state)
+{
+	struct xfs_ifork	*ifp = xfs_iext_state_to_fork(ip, state);
+	xfs_fileoff_t		offset = irec->br_startoff;
+	struct xfs_iext_leaf	*new = NULL;
+	int			nr_entries, i;
+
+	if (ifp->if_height == 0)
+		xfs_iext_alloc_root(ifp, cur);
+	else if (ifp->if_height == 1)
+		xfs_iext_realloc_root(ifp, cur);
+
+	nr_entries = xfs_iext_leaf_nr_entries(ifp, cur->leaf, cur->pos);
+	ASSERT(nr_entries <= RECS_PER_LEAF);
+	ASSERT(cur->pos >= nr_entries ||
+	       xfs_iext_rec_cmp(cur_rec(cur), irec->br_startoff) != 0);
+
+	if (nr_entries == RECS_PER_LEAF)
+		new = xfs_iext_split_leaf(cur, &nr_entries);
+
+	/*
+	 * Update the pointers in higher levels if the first entry changes
+	 * in an existing node.
+	 */
+	if (cur->leaf != new && cur->pos == 0 && nr_entries > 0) {
+		xfs_iext_update_node(ifp, xfs_iext_leaf_key(cur->leaf, 0),
+				offset, 1, cur->leaf);
+	}
+
+	/* shift the records from cur->pos onwards up by one slot */
+	for (i = nr_entries; i > cur->pos; i--)
+		cur->leaf->recs[i] = cur->leaf->recs[i - 1];
+	xfs_iext_set(cur_rec(cur), irec);
+	ifp->if_bytes += sizeof(struct xfs_iext_rec);
+
+	/*
+	 * The tracepoint is handed the cursor, so only fire it once the
+	 * cursor addresses the record that was just inserted.  Before this
+	 * point the cursor may reference a leaf that is about to be
+	 * replaced, or a slot that still holds a different record or none.
+	 */
+	trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+
+	/* a split happened: link the new leaf into the inner levels */
+	if (new)
+		xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
+}
+
+/*
+ * Try to get rid of an under-full inner @node, which sits at slot *@pos of
+ * @parent and holds @nr_entries entries.
+ *
+ * Returns the node the caller should remove from @parent: @node itself if
+ * it is empty or its entries were appended to the previous sibling, or the
+ * next sibling if that one was merged into @node (*@pos is then advanced to
+ * the sibling's slot).  Returns NULL if no merge was possible.
+ */
+static struct xfs_iext_node *
+xfs_iext_rebalance_node(
+ struct xfs_iext_node *parent,
+ int *pos,
+ struct xfs_iext_node *node,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full, or have different
+ * parents, we might never be able to merge our node, and will only
+ * delete it once the number of entries hits zero.
+ */
+ if (nr_entries == 0)
+ return node;
+
+ /* previous sibling under the same parent: append our entries to it */
+ if (*pos > 0) {
+ struct xfs_iext_node *prev = parent->ptrs[*pos - 1];
+ int nr_prev = xfs_iext_node_nr_entries(prev, 0), i;
+
+ if (nr_prev + nr_entries <= KEYS_PER_NODE) {
+ for (i = 0; i < nr_entries; i++) {
+ prev->keys[nr_prev + i] = node->keys[i];
+ prev->ptrs[nr_prev + i] = node->ptrs[i];
+ }
+ return node;
+ }
+ }
+
+ /* next sibling under the same parent: pull its entries into ours */
+ if (*pos + 1 < xfs_iext_node_nr_entries(parent, *pos)) {
+ struct xfs_iext_node *next = parent->ptrs[*pos + 1];
+ int nr_next = xfs_iext_node_nr_entries(next, 0), i;
+
+ if (nr_entries + nr_next <= KEYS_PER_NODE) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ node->keys[nr_entries + i] = next->keys[i];
+ node->ptrs[nr_entries + i] = next->ptrs[i];
+ }
+
+ ++*pos;
+ return next;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Free the block @victim and remove its entry from the level-2 node that
+ * covers @offset.  When a node drops below half full, a merge with a
+ * sibling is attempted and the removal repeats one level up for whichever
+ * node the merge made redundant.  A root node left with a single entry is
+ * replaced by its only child.
+ */
+static void
+xfs_iext_remove_node(
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ void *victim)
+{
+ struct xfs_iext_node *node, *parent;
+ int level = 2, pos, nr_entries, i;
+
+ ASSERT(level <= ifp->if_height);
+ node = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(node, offset);
+again:
+ ASSERT(node->ptrs[pos]);
+ ASSERT(node->ptrs[pos] == victim);
+ kmem_free(victim);
+
+ nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
+ /* remember the first key as it was before the entries are shifted */
+ offset = node->keys[0];
+ for (i = pos; i < nr_entries; i++) {
+ node->keys[i] = node->keys[i + 1];
+ node->ptrs[i] = node->ptrs[i + 1];
+ }
+ node->keys[nr_entries] = XFS_IEXT_KEY_INVALID;
+ node->ptrs[nr_entries] = NULL;
+
+ /* the first key changed: propagate the new one to the levels above */
+ if (pos == 0 && nr_entries > 0) {
+ xfs_iext_update_node(ifp, offset, node->keys[0], level, node);
+ offset = node->keys[0];
+ }
+
+ /* still at least half full: no rebalancing needed */
+ if (nr_entries >= KEYS_PER_NODE / 2)
+ return;
+
+ if (level < ifp->if_height) {
+ /*
+ * If we aren't at the root yet try to find a neighbour node to
+ * merge with (or delete the node if it is empty), and then
+ * recurse up to the next level.
+ */
+ level++;
+ parent = xfs_iext_find_level(ifp, offset, level);
+ pos = xfs_iext_node_pos(parent, offset);
+
+ ASSERT(pos != KEYS_PER_NODE);
+ ASSERT(parent->ptrs[pos] == node);
+
+ node = xfs_iext_rebalance_node(parent, &pos, node, nr_entries);
+ if (node) {
+ /* a node became redundant: remove it from the parent */
+ victim = node;
+ node = parent;
+ goto again;
+ }
+ } else if (nr_entries == 1) {
+ /*
+ * If we are at the root and only one entry is left we can just
+ * free this node and update the root pointer.
+ */
+ ASSERT(node == ifp->if_u1.if_root);
+ ifp->if_u1.if_root = node->ptrs[0];
+ ifp->if_height--;
+ kmem_free(node);
+ }
+}
+
+/*
+ * Try to get rid of the under-full @leaf, which holds @nr_entries records
+ * and whose first key is @offset.  An empty leaf is removed outright.
+ * Otherwise its records are appended to the previous leaf, or the next
+ * leaf's records are pulled into it, provided the combined records fit
+ * into one leaf.  The leaf made redundant is unlinked from the sibling
+ * list and handed to xfs_iext_remove_node().  @cur is adjusted when the
+ * record it addresses moves to another leaf.
+ */
+static void
+xfs_iext_rebalance_leaf(
+ struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_iext_leaf *leaf,
+ xfs_fileoff_t offset,
+ int nr_entries)
+{
+ /*
+ * If the neighbouring nodes are completely full we might never be able
+ * to merge our node, and will only delete it once the number of
+ * entries hits zero.
+ */
+ if (nr_entries == 0)
+ goto remove_node;
+
+ if (leaf->prev) {
+ int nr_prev = xfs_iext_leaf_nr_entries(ifp, leaf->prev, 0), i;
+
+ if (nr_prev + nr_entries <= RECS_PER_LEAF) {
+ for (i = 0; i < nr_entries; i++)
+ leaf->prev->recs[nr_prev + i] = leaf->recs[i];
+
+ /* the cursor's record now lives nr_prev slots into prev */
+ if (cur->leaf == leaf) {
+ cur->leaf = leaf->prev;
+ cur->pos += nr_prev;
+ }
+ goto remove_node;
+ }
+ }
+
+ if (leaf->next) {
+ int nr_next = xfs_iext_leaf_nr_entries(ifp, leaf->next, 0), i;
+
+ if (nr_entries + nr_next <= RECS_PER_LEAF) {
+ /*
+ * Merge the next node into this node so that we don't
+ * have to do an additional update of the keys in the
+ * higher levels.
+ */
+ for (i = 0; i < nr_next; i++) {
+ leaf->recs[nr_entries + i] =
+ leaf->next->recs[i];
+ }
+
+ /* the cursor's record now lives nr_entries slots into leaf */
+ if (cur->leaf == leaf->next) {
+ cur->leaf = leaf;
+ cur->pos += nr_entries;
+ }
+
+ /* from here on the next leaf is the one to remove */
+ offset = xfs_iext_leaf_key(leaf->next, 0);
+ leaf = leaf->next;
+ goto remove_node;
+ }
+ }
+
+ return;
+remove_node:
+ /* unlink the redundant leaf from the sibling list before freeing it */
+ if (leaf->prev)
+ leaf->prev->next = leaf->next;
+ if (leaf->next)
+ leaf->next->prev = leaf->prev;
+ xfs_iext_remove_node(ifp, offset, leaf);
+}
+
+/*
+ * Release the root leaf once the last record has been removed from a
+ * height-one tree, leaving the fork with no root.
+ *
+ * The block has to be freed before the root pointer is cleared.  Clearing
+ * the pointer first would pass NULL to kmem_free() and leak the leaf.
+ */
+static void
+xfs_iext_free_last_leaf(
+	struct xfs_ifork	*ifp)
+{
+	ifp->if_height--;
+	kmem_free(ifp->if_u1.if_root);
+	ifp->if_u1.if_root = NULL;
+}
+
+/*
+ * Remove the record addressed by @cur from the fork of @ip selected by
+ * @state.  The remaining records of the leaf are shifted down by one slot.
+ * If the removed record was the last one in its leaf, @cur moves on to slot
+ * 0 of the next leaf, or gets a NULL leaf if there is none.  A leaf that
+ * drops below half full is rebalanced, and the root leaf of a height-one
+ * tree is freed once it is empty.
+ */
+void
+xfs_iext_remove(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+ struct xfs_iext_leaf *leaf = cur->leaf;
+ /* first key of the leaf as it is before the removal */
+ xfs_fileoff_t offset = xfs_iext_leaf_key(leaf, 0);
+ int i, nr_entries;
+
+ trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
+
+ ASSERT(ifp->if_height > 0);
+ ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(xfs_iext_valid(ifp, cur));
+
+ /* nr_entries is the record count of this leaf after the removal */
+ nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
+ for (i = cur->pos; i < nr_entries; i++)
+ leaf->recs[i] = leaf->recs[i + 1];
+ xfs_iext_rec_clear(&leaf->recs[nr_entries]);
+ ifp->if_bytes -= sizeof(struct xfs_iext_rec);
+
+ if (cur->pos == 0 && nr_entries > 0) {
+ /* the leaf's first key changed: fix up the levels above */
+ xfs_iext_update_node(ifp, offset, xfs_iext_leaf_key(leaf, 0), 1,
+ leaf);
+ offset = xfs_iext_leaf_key(leaf, 0);
+ } else if (cur->pos == nr_entries) {
+ /* removed the last record of this leaf: step to the next leaf */
+ if (ifp->if_height > 1 && leaf->next)
+ cur->leaf = leaf->next;
+ else
+ cur->leaf = NULL;
+ cur->pos = 0;
+ }
+
+ /* still at least half full: no rebalancing needed */
+ if (nr_entries >= RECS_PER_LEAF / 2)
+ return;
+
+ if (ifp->if_height > 1)
+ xfs_iext_rebalance_leaf(ifp, cur, leaf, offset, nr_entries);
+ else if (nr_entries == 0)
+ xfs_iext_free_last_leaf(ifp);
+}
+
+/*
+ * Lookup the extent covering offset.
+ *
+ * If there is an extent covering offset return true, and store the expanded
+ * extent structure in *gotp, and the extent cursor in *cur.
+ * If there is no extent covering offset, but there is an extent after it
+ * (e.g. it lies in a hole) return true as well, with that extent in *gotp
+ * and its cursor in *cur instead.
+ * If offset is beyond the last extent return false, and return an invalid
+ * cursor value.
+ */
+bool
+xfs_iext_lookup_extent(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t offset,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp)
+{
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+ cur->leaf = xfs_iext_find_level(ifp, offset, 1);
+ if (!cur->leaf) {
+ cur->pos = 0;
+ return false;
+ }
+
+ /* find the first record that covers offset or starts after it */
+ for (cur->pos = 0; cur->pos < xfs_iext_max_recs(ifp); cur->pos++) {
+ struct xfs_iext_rec *rec = cur_rec(cur);
+
+ if (xfs_iext_rec_is_empty(rec))
+ break;
+ if (xfs_iext_rec_cmp(rec, offset) >= 0)
+ goto found;
+ }
+
+ /* Try looking in the next node for an entry > offset */
+ if (ifp->if_height == 1 || !cur->leaf->next)
+ return false;
+ cur->leaf = cur->leaf->next;
+ cur->pos = 0;
+ if (!xfs_iext_valid(ifp, cur))
+ return false;
+found:
+ xfs_iext_get(gotp, cur_rec(cur));
+ return true;
+}
+
+/*
+ * Find the last extent that starts before *end.  If that extent does not
+ * reach up to *end, *end is pulled back to the end of the extent.  Returns
+ * false if no such extent exists.
+ */
+bool
+xfs_iext_lookup_extent_before(
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp,
+	xfs_fileoff_t		*end,
+	struct xfs_iext_cursor	*cur,
+	struct xfs_bmbt_irec	*gotp)
+{
+	xfs_fileoff_t		last = *end - 1;
+
+	/* could be optimized to not even look up the next on a match.. */
+	if (xfs_iext_lookup_extent(ip, ifp, last, cur, gotp) &&
+	    gotp->br_startoff <= last)
+		return true;
+
+	/* nothing covers the last block: fall back to the extent before */
+	if (!xfs_iext_prev_extent(ifp, cur, gotp))
+		return false;
+
+	*end = gotp->br_startoff + gotp->br_blockcount;
+	return true;
+}
+
+/*
+ * Overwrite the record addressed by @cur with @new.  If the record is the
+ * first one in its leaf and its start offset changes, the keys in the inner
+ * levels are updated to match before the record is rewritten.
+ */
+void
+xfs_iext_update_extent(
+ struct xfs_inode *ip,
+ int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *new)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ /* only the first record of a leaf is used as a key further up */
+ if (cur->pos == 0) {
+ struct xfs_bmbt_irec old;
+
+ xfs_iext_get(&old, cur_rec(cur));
+ if (new->br_startoff != old.br_startoff) {
+ xfs_iext_update_node(ifp, old.br_startoff,
+ new->br_startoff, 1, cur->leaf);
+ }
+ }
+
+ trace_xfs_bmap_pre_update(ip, cur, state, _RET_IP_);
+ xfs_iext_set(cur_rec(cur), new);
+ trace_xfs_bmap_post_update(ip, cur, state, _RET_IP_);
+}
+
+/*
+ * If the cursor addresses an extent, expand it into *gotp and return true.
+ * Otherwise leave *gotp untouched and return false.
+ */
+bool
+xfs_iext_get_extent(
+	struct xfs_ifork	*ifp,
+	struct xfs_iext_cursor	*cur,
+	struct xfs_bmbt_irec	*gotp)
+{
+	bool			valid = xfs_iext_valid(ifp, cur);
+
+	if (valid)
+		xfs_iext_get(gotp, cur_rec(cur));
+	return valid;
+}
+
+/*
+ * Free @node and, for inner nodes (@level > 1), everything below it.
+ *
+ * This function recurses once per tree level, so its stack footprint has to
+ * stay as small as possible.
+ */
+static void
+xfs_iext_destroy_node(
+	struct xfs_iext_node	*node,
+	int			level)
+{
+	int			slot = 0;
+
+	if (level > 1) {
+		/* children occupy the slots up to the first invalid key */
+		while (slot < KEYS_PER_NODE &&
+		       node->keys[slot] != XFS_IEXT_KEY_INVALID) {
+			xfs_iext_destroy_node(node->ptrs[slot], level - 1);
+			slot++;
+		}
+	}
+
+	kmem_free(node);
+}
+
+/* Free the fork's whole in-core extent tree and reset it to empty. */
+void
+xfs_iext_destroy(
+	struct xfs_ifork	*ifp)
+{
+	xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+
+	ifp->if_u1.if_root = NULL;
+	ifp->if_height = 0;
+	ifp->if_bytes = 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 378f8fbc91a7..6b7989038d75 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -24,6 +24,7 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_icache.h"
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 31840ca24018..1c90ec41e9df 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,21 +42,27 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
+{
+ return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
+}
+
/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode. For fifos, devs, and sockets
- * this means set if_rdev to the proper value. For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers. For a file in B-tree format, only the root is immediately
- * brought in-core. The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
+ * Copy inode type and data and attr format specific information from the
+ * on-disk inode to the in-core inode and fork structures. For fifos, devices,
+ * and sockets this means set i_rdev to the proper value. For files,
+ * directories, and symlinks this means to bring in the in-line data or extent
+ * pointers as well as the attribute fork. For a fork in B-tree format, only
+ * the root is immediately brought in-core. The rest will be read in later when
+ * first referenced (see xfs_iread_extents()).
*/
int
xfs_iformat_fork(
- xfs_inode_t *ip,
- xfs_dinode_t *dip)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip)
{
- xfs_attr_shortform_t *atp;
+ struct inode *inode = VFS_I(ip);
+ struct xfs_attr_shortform *atp;
int size;
int error = 0;
xfs_fsize_t di_size;
@@ -95,8 +101,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
- if (unlikely(xfs_is_reflink_inode(ip) &&
- (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) {
+ if (unlikely(xfs_is_reflink_inode(ip) && !S_ISREG(inode->i_mode))) {
xfs_warn(ip->i_mount,
"corrupt dinode %llu, wrong file type for reflink.",
ip->i_ino);
@@ -115,7 +120,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
- switch (VFS_I(ip)->i_mode & S_IFMT) {
+ switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
@@ -126,7 +131,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
ip->i_d.di_size = 0;
- ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+ inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
break;
case S_IFREG:
@@ -184,8 +189,7 @@ xfs_iformat_fork(
return error;
/* Check inline dir contents. */
- if (S_ISDIR(VFS_I(ip)->i_mode) &&
- dip->di_format == XFS_DINODE_FMT_LOCAL) {
+ if (S_ISDIR(inode->i_mode) && dip->di_format == XFS_DINODE_FMT_LOCAL) {
error = xfs_dir2_sf_verify(ip);
if (error) {
xfs_idestroy_fork(ip, XFS_DATA_FORK);
@@ -265,19 +269,14 @@ xfs_init_local_fork(
if (zero_terminate)
mem_size++;
- if (size == 0)
- ifp->if_u1.if_data = NULL;
- else if (mem_size <= sizeof(ifp->if_u2.if_inline_data))
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- else {
+ if (size) {
real_size = roundup(mem_size, 4);
ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
- }
-
- if (size) {
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
+ } else {
+ ifp->if_u1.if_data = NULL;
}
ifp->if_bytes = size;
@@ -288,13 +287,6 @@ xfs_init_local_fork(
/*
* The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there. Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
*/
STATIC int
xfs_iformat_local(
@@ -324,9 +316,7 @@ xfs_iformat_local(
/*
* The file consists of a set of extents all of which fit into the on-disk
- * inode. If there are few enough extents to fit into the if_inline_ext, then
- * copy them there. Otherwise allocate a buffer for them and copy them into it.
- * Either way, set if_extents to point at the extents.
+ * inode.
*/
STATIC int
xfs_iformat_extents(
@@ -336,9 +326,12 @@ xfs_iformat_extents(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int state = xfs_bmap_fork_to_state(whichfork);
int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
+ struct xfs_bmbt_irec new;
int i;
/*
@@ -354,27 +347,25 @@ xfs_iformat_extents(
}
ifp->if_real_bytes = 0;
- if (nex == 0)
- ifp->if_u1.if_extents = NULL;
- else if (nex <= XFS_INLINE_EXTS)
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- else
- xfs_iext_add(ifp, 0, nex);
-
- ifp->if_bytes = size;
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+
+ xfs_iext_first(ifp, &icur);
for (i = 0; i < nex; i++, dp++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- ep->l0 = get_unaligned_be64(&dp->l0);
- ep->l1 = get_unaligned_be64(&dp->l1);
- if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) {
+ xfs_bmbt_disk_get_all(dp, &new);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) {
XFS_ERROR_REPORT("xfs_iformat_extents(2)",
XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
+
+ xfs_iext_insert(ip, &icur, &new, state);
+ trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
+ xfs_iext_next(ifp, &icur);
}
- XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
}
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
@@ -440,47 +431,14 @@ xfs_iformat_btree(
ifp->if_flags &= ~XFS_IFEXTENTS;
ifp->if_flags |= XFS_IFBROOT;
+ ifp->if_real_bytes = 0;
+ ifp->if_bytes = 0;
+ ifp->if_u1.if_root = NULL;
+ ifp->if_height = 0;
return 0;
}
/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- int whichfork)
-{
- int error;
- xfs_ifork_t *ifp;
- xfs_extnum_t nextents;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
- nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
- ifp = XFS_IFORK_PTR(ip, whichfork);
-
- /*
- * We know that the size is valid (it's checked in iformat_btree)
- */
- ifp->if_bytes = ifp->if_real_bytes = 0;
- xfs_iext_add(ifp, 0, nextents);
- error = xfs_bmap_read_extents(tp, ip, whichfork);
- if (error) {
- xfs_iext_destroy(ifp);
- return error;
- }
- ifp->if_flags |= XFS_IFEXTENTS;
- return 0;
-}
-/*
* Reallocate the space for if_broot based on the number of records
* being added or deleted as indicated in rec_diff. Move the records
* and pointers in if_broot to fit the new size. When shrinking this
@@ -644,26 +602,9 @@ xfs_idata_realloc(
ASSERT(new_size >= 0);
if (new_size == 0) {
- if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- kmem_free(ifp->if_u1.if_data);
- }
+ kmem_free(ifp->if_u1.if_data);
ifp->if_u1.if_data = NULL;
real_size = 0;
- } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
- /*
- * If the valid extents/data can fit in if_inline_ext/data,
- * copy them from the malloc'd vector and free it.
- */
- if (ifp->if_u1.if_data == NULL) {
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- ASSERT(ifp->if_real_bytes != 0);
- memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
- new_size);
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
- }
- real_size = 0;
} else {
/*
* Stuck with malloc/realloc.
@@ -677,7 +618,7 @@ xfs_idata_realloc(
ASSERT(ifp->if_real_bytes == 0);
ifp->if_u1.if_data = kmem_alloc(real_size,
KM_SLEEP | KM_NOFS);
- } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ } else {
/*
* Only do the realloc if the underlying size
* is really changing.
@@ -688,12 +629,6 @@ xfs_idata_realloc(
real_size,
KM_SLEEP | KM_NOFS);
}
- } else {
- ASSERT(ifp->if_real_bytes == 0);
- ifp->if_u1.if_data = kmem_alloc(real_size,
- KM_SLEEP | KM_NOFS);
- memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
- ifp->if_bytes);
}
}
ifp->if_real_bytes = real_size;
@@ -721,23 +656,18 @@ xfs_idestroy_fork(
* so check and free it up if we do.
*/
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
- (ifp->if_u1.if_data != NULL)) {
+ if (ifp->if_u1.if_data != NULL) {
ASSERT(ifp->if_real_bytes != 0);
kmem_free(ifp->if_u1.if_data);
ifp->if_u1.if_data = NULL;
ifp->if_real_bytes = 0;
}
- } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
- ((ifp->if_flags & XFS_IFEXTIREC) ||
- ((ifp->if_u1.if_extents != NULL) &&
- (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
- ASSERT(ifp->if_real_bytes != 0);
+ } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) {
xfs_iext_destroy(ifp);
}
- ASSERT(ifp->if_u1.if_extents == NULL ||
- ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+
ASSERT(ifp->if_real_bytes == 0);
+
if (whichfork == XFS_ATTR_FORK) {
kmem_zone_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
@@ -747,19 +677,9 @@ xfs_idestroy_fork(
}
}
-/* Count number of incore extents based on if_bytes */
-xfs_extnum_t
-xfs_iext_count(struct xfs_ifork *ifp)
-{
- return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-}
-
/*
* Convert in-core extents to on-disk form
*
- * For either the data or attr fork in extent format, we need to endian convert
- * the in-core extent as we place them into the on-disk inode.
- *
* In the case of the data fork, the in-core and on-disk fork sizes can be
* different due to delayed allocation extents. We only copy on-disk extents
* here, so callers must always use the physical fork size to determine the
@@ -768,53 +688,32 @@ xfs_iext_count(struct xfs_ifork *ifp)
*/
int
xfs_iextents_copy(
- xfs_inode_t *ip,
- xfs_bmbt_rec_t *dp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_rec *dp,
int whichfork)
{
- int copied;
- int i;
- xfs_ifork_t *ifp;
- int nrecs;
- xfs_fsblock_t start_block;
+ int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec rec;
+ int copied = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(ifp->if_bytes > 0);
- nrecs = xfs_iext_count(ifp);
- XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
- ASSERT(nrecs > 0);
-
- /*
- * There are some delayed allocation extents in the
- * inode, so copy the extents one at a time and skip
- * the delayed ones. There must be at least one
- * non-delayed extent.
- */
- copied = 0;
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-
- ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep));
-
- start_block = xfs_bmbt_get_startblock(ep);
- if (isnullstartblock(start_block)) {
- /*
- * It's a delayed allocation extent, so skip it.
- */
+ for_each_xfs_iext(ifp, &icur, &rec) {
+ if (isnullstartblock(rec.br_startblock))
continue;
- }
-
- /* Translate to on disk format */
- put_unaligned_be64(ep->l0, &dp->l0);
- put_unaligned_be64(ep->l1, &dp->l1);
+ ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec));
+ xfs_bmbt_disk_set_all(dp, &rec);
+ trace_xfs_write_extent(ip, &icur, state, _RET_IP_);
+ copied += sizeof(struct xfs_bmbt_rec);
dp++;
- copied++;
}
- ASSERT(copied != 0);
- return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+ ASSERT(copied > 0);
+ ASSERT(copied <= ifp->if_bytes);
+ return copied;
}
/*
@@ -872,7 +771,6 @@ xfs_iflush_fork(
!(iip->ili_fields & extflag[whichfork]));
if ((iip->ili_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(xfs_iext_get_ext(ifp, 0));
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
whichfork);
@@ -894,16 +792,7 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_DEV:
if (iip->ili_fields & XFS_ILOG_DEV) {
ASSERT(whichfork == XFS_DATA_FORK);
- xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
- }
- break;
-
- case XFS_DINODE_FMT_UUID:
- if (iip->ili_fields & XFS_ILOG_UUID) {
- ASSERT(whichfork == XFS_DATA_FORK);
- memcpy(XFS_DFORK_DPTR(dip),
- &ip->i_df.if_u2.if_uuid,
- sizeof(uuid_t));
+ xfs_dinode_put_rdev(dip, sysv_encode_dev(VFS_I(ip)->i_rdev));
}
break;
@@ -913,33 +802,6 @@ xfs_iflush_fork(
}
}
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx) /* index of target extent */
-{
- ASSERT(idx >= 0);
- ASSERT(idx < xfs_iext_count(ifp));
-
- if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
- return ifp->if_u1.if_ext_irec->er_extbuf;
- } else if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_ext_irec_t *erp; /* irec pointer */
- int erp_idx = 0; /* irec index */
- xfs_extnum_t page_idx = idx; /* ext index in target list */
-
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
- return &erp->er_extbuf[page_idx];
- } else if (ifp->if_bytes) {
- return &ifp->if_u1.if_extents[idx];
- } else {
- return NULL;
- }
-}
-
/* Convert bmap state flags to an inode fork. */
struct xfs_ifork *
xfs_iext_state_to_fork(
@@ -954,1011 +816,6 @@ xfs_iext_state_to_fork(
}
/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'. 'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* starting index of new items */
- xfs_extnum_t count, /* number of inserted items */
- xfs_bmbt_irec_t *new, /* items to insert */
- int state) /* type of extent conversion */
-{
- xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
- xfs_extnum_t i; /* extent record index */
-
- trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- xfs_iext_add(ifp, idx, count);
- for (i = idx; i < idx + count; i++, new++)
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin adding exts */
- int ext_diff) /* number of extents to add */
-{
- int byte_diff; /* new bytes being added */
- int new_size; /* size of extents after adding */
- xfs_extnum_t nextents; /* number of extents in file */
-
- nextents = xfs_iext_count(ifp);
- ASSERT((idx >= 0) && (idx <= nextents));
- byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
- new_size = ifp->if_bytes + byte_diff;
- /*
- * If the new number of extents (nextents + ext_diff)
- * fits inside the inode, then continue to use the inline
- * extent buffer.
- */
- if (nextents + ext_diff <= XFS_INLINE_EXTS) {
- if (idx < nextents) {
- memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
- &ifp->if_u2.if_inline_ext[idx],
- (nextents - idx) * sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
- }
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- ifp->if_real_bytes = 0;
- }
- /*
- * Otherwise use a linear (direct) extent list.
- * If the extents are currently inside the inode,
- * xfs_iext_realloc_direct will switch us from
- * inline to direct extent allocation mode.
- */
- else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
- xfs_iext_realloc_direct(ifp, new_size);
- if (idx < nextents) {
- memmove(&ifp->if_u1.if_extents[idx + ext_diff],
- &ifp->if_u1.if_extents[idx],
- (nextents - idx) * sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
- }
- }
- /* Indirection array */
- else {
- xfs_ext_irec_t *erp;
- int erp_idx = 0;
- int page_idx = idx;
-
- ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
- if (ifp->if_flags & XFS_IFEXTIREC) {
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
- } else {
- xfs_iext_irec_init(ifp);
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = ifp->if_u1.if_ext_irec;
- }
- /* Extents fit in target extent page */
- if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
- if (page_idx < erp->er_extcount) {
- memmove(&erp->er_extbuf[page_idx + ext_diff],
- &erp->er_extbuf[page_idx],
- (erp->er_extcount - page_idx) *
- sizeof(xfs_bmbt_rec_t));
- memset(&erp->er_extbuf[page_idx], 0, byte_diff);
- }
- erp->er_extcount += ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- }
- /* Insert a new extent page */
- else if (erp) {
- xfs_iext_add_indirect_multi(ifp,
- erp_idx, page_idx, ext_diff);
- }
- /*
- * If extent(s) are being appended to the last page in
- * the indirection array and the new extent(s) don't fit
- * in the page, then erp is NULL and erp_idx is set to
- * the next index needed in the indirection array.
- */
- else {
- uint count = ext_diff;
-
- while (count) {
- erp = xfs_iext_irec_new(ifp, erp_idx);
- erp->er_extcount = min(count, XFS_LINEAR_EXTS);
- count -= erp->er_extcount;
- if (count)
- erp_idx++;
- }
- }
- }
- ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- * |-------| |-------|
- * | | | | idx - number of extents before idx
- * | idx | | count |
- * | | | | count - number of extents being inserted at idx
- * |-------| |-------|
- * | count | | nex2 | nex2 - number of extents after idx + count
- * |-------| |-------|
- */
-void
-xfs_iext_add_indirect_multi(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx, /* target extent irec index */
- xfs_extnum_t idx, /* index within target list */
- int count) /* new extents being added */
-{
- int byte_diff; /* new bytes being added */
- xfs_ext_irec_t *erp; /* pointer to irec entry */
- xfs_extnum_t ext_diff; /* number of extents to add */
- xfs_extnum_t ext_cnt; /* new extents still needed */
- xfs_extnum_t nex2; /* extents after idx + count */
- xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
- int nlists; /* number of irec's (lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- nex2 = erp->er_extcount - idx;
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
- /*
- * Save second part of target extent list
- * (all extents past */
- if (nex2) {
- byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
- memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
- erp->er_extcount -= nex2;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
- memset(&erp->er_extbuf[idx], 0, byte_diff);
- }
-
- /*
- * Add the new extents to the end of the target
- * list, then allocate new irec record(s) and
- * extent buffer(s) as needed to store the rest
- * of the new extents.
- */
- ext_cnt = count;
- ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
- if (ext_diff) {
- erp->er_extcount += ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- ext_cnt -= ext_diff;
- }
- while (ext_cnt) {
- erp_idx++;
- erp = xfs_iext_irec_new(ifp, erp_idx);
- ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
- erp->er_extcount = ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
- ext_cnt -= ext_diff;
- }
-
- /* Add nex2 extents back to indirection array */
- if (nex2) {
- xfs_extnum_t ext_avail;
- int i;
-
- byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
- i = 0;
- /*
- * If nex2 extents fit in the current page, append
- * nex2_ep after the new extents.
- */
- if (nex2 <= ext_avail) {
- i = erp->er_extcount;
- }
- /*
- * Otherwise, check if space is available in the
- * next page.
- */
- else if ((erp_idx < nlists - 1) &&
- (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
- ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
- erp_idx++;
- erp++;
- /* Create a hole for nex2 extents */
- memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
- erp->er_extcount * sizeof(xfs_bmbt_rec_t));
- }
- /*
- * Final choice, create a new extent page for
- * nex2 extents.
- */
- else {
- erp_idx++;
- erp = xfs_iext_irec_new(ifp, erp_idx);
- }
- memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
- kmem_free(nex2_ep);
- erp->er_extcount += nex2;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
- }
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array. Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff, /* number of extents to remove */
- int state) /* type of extent conversion */
-{
- xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state);
- xfs_extnum_t nextents; /* number of extents in file */
- int new_size; /* size of extents after removal */
-
- trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
- ASSERT(ext_diff > 0);
- nextents = xfs_iext_count(ifp);
- new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- } else if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_iext_remove_indirect(ifp, idx, ext_diff);
- } else if (ifp->if_real_bytes) {
- xfs_iext_remove_direct(ifp, idx, ext_diff);
- } else {
- xfs_iext_remove_inline(ifp, idx, ext_diff);
- }
- ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff) /* number of extents to remove */
-{
- int nextents; /* number of extents in file */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- ASSERT(idx < XFS_INLINE_EXTS);
- nextents = xfs_iext_count(ifp);
- ASSERT(((nextents - ext_diff) > 0) &&
- (nextents - ext_diff) < XFS_INLINE_EXTS);
-
- if (idx + ext_diff < nextents) {
- memmove(&ifp->if_u2.if_inline_ext[idx],
- &ifp->if_u2.if_inline_ext[idx + ext_diff],
- (nextents - (idx + ext_diff)) *
- sizeof(xfs_bmbt_rec_t));
- memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
- 0, ext_diff * sizeof(xfs_bmbt_rec_t));
- } else {
- memset(&ifp->if_u2.if_inline_ext[idx], 0,
- ext_diff * sizeof(xfs_bmbt_rec_t));
- }
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (ie. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing exts */
- int ext_diff) /* number of extents to remove */
-{
- xfs_extnum_t nextents; /* number of extents in file */
- int new_size; /* size of extents after removal */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- new_size = ifp->if_bytes -
- (ext_diff * sizeof(xfs_bmbt_rec_t));
- nextents = xfs_iext_count(ifp);
-
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- return;
- }
- /* Move extents up in the list (if needed) */
- if (idx + ext_diff < nextents) {
- memmove(&ifp->if_u1.if_extents[idx],
- &ifp->if_u1.if_extents[idx + ext_diff],
- (nextents - (idx + ext_diff)) *
- sizeof(xfs_bmbt_rec_t));
- }
- memset(&ifp->if_u1.if_extents[nextents - ext_diff],
- 0, ext_diff * sizeof(xfs_bmbt_rec_t));
- /*
- * Reallocate the direct extent list. If the extents
- * will fit inside the inode then xfs_iext_realloc_direct
- * will switch from direct to inline extent allocation
- * mode for us.
- */
- xfs_iext_realloc_direct(ifp, new_size);
- ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- * |-------| |-------|
- * | nex1 | | | nex1 - number of extents before idx
- * |-------| | count |
- * | | | | count - number of extents being removed at idx
- * | count | |-------|
- * | | | nex2 | nex2 - number of extents after idx + count
- * |-------| |-------|
- */
-void
-xfs_iext_remove_indirect(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t idx, /* index to begin removing extents */
- int count) /* number of extents to remove */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int erp_idx = 0; /* indirection array index */
- xfs_extnum_t ext_cnt; /* extents left to remove */
- xfs_extnum_t ext_diff; /* extents to remove in current list */
- xfs_extnum_t nex1; /* number of extents before idx */
- xfs_extnum_t nex2; /* extents after idx + count */
- int page_idx = idx; /* index in target extent list */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
- ASSERT(erp != NULL);
- nex1 = page_idx;
- ext_cnt = count;
- while (ext_cnt) {
- nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
- ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
- /*
- * Check for deletion of entire list;
- * xfs_iext_irec_remove() updates extent offsets.
- */
- if (ext_diff == erp->er_extcount) {
- xfs_iext_irec_remove(ifp, erp_idx);
- ext_cnt -= ext_diff;
- nex1 = 0;
- if (ext_cnt) {
- ASSERT(erp_idx < ifp->if_real_bytes /
- XFS_IEXT_BUFSZ);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- nex1 = 0;
- continue;
- } else {
- break;
- }
- }
- /* Move extents up (if needed) */
- if (nex2) {
- memmove(&erp->er_extbuf[nex1],
- &erp->er_extbuf[nex1 + ext_diff],
- nex2 * sizeof(xfs_bmbt_rec_t));
- }
- /* Zero out rest of page */
- memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
- ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
- /* Update remaining counters */
- erp->er_extcount -= ext_diff;
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
- ext_cnt -= ext_diff;
- nex1 = 0;
- erp_idx++;
- erp++;
- }
- ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
- xfs_iext_irec_compact(ifp);
-}
-
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new size of extents after adding */
-{
- int rnew_size; /* real new size of extents */
-
- rnew_size = new_size;
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
- ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
- (new_size != ifp->if_real_bytes)));
-
- /* Free extent records */
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- }
- /* Resize direct extent list and zero any new bytes */
- else if (ifp->if_real_bytes) {
- /* Check if extents will fit inside the inode */
- if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
- xfs_iext_direct_to_inline(ifp, new_size /
- (uint)sizeof(xfs_bmbt_rec_t));
- ifp->if_bytes = new_size;
- return;
- }
- if (!is_power_of_2(new_size)){
- rnew_size = roundup_pow_of_two(new_size);
- }
- if (rnew_size != ifp->if_real_bytes) {
- ifp->if_u1.if_extents =
- kmem_realloc(ifp->if_u1.if_extents,
- rnew_size, KM_NOFS);
- }
- if (rnew_size > ifp->if_real_bytes) {
- memset(&ifp->if_u1.if_extents[ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)], 0,
- rnew_size - ifp->if_real_bytes);
- }
- }
- /* Switch from the inline extent buffer to a direct extent list */
- else {
- if (!is_power_of_2(new_size)) {
- rnew_size = roundup_pow_of_two(new_size);
- }
- xfs_iext_inline_to_direct(ifp, rnew_size);
- }
- ifp->if_real_bytes = rnew_size;
- ifp->if_bytes = new_size;
-}
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t nextents) /* number of extents in file */
-{
- ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(nextents <= XFS_INLINE_EXTS);
- /*
- * The inline buffer was zeroed when we switched
- * from inline to direct extent allocation mode,
- * so we don't need to clear it here.
- */
- memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
- nextents * sizeof(xfs_bmbt_rec_t));
- kmem_free(ifp->if_u1.if_extents);
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
- ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* number of extents in file */
-{
- ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
- memset(ifp->if_u1.if_extents, 0, new_size);
- if (ifp->if_bytes) {
- memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
- ifp->if_bytes);
- memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
- sizeof(xfs_bmbt_rec_t));
- }
- ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new indirection array size */
-{
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(ifp->if_real_bytes);
- ASSERT((new_size >= 0) &&
- (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
- sizeof(xfs_ext_irec_t))));
- if (new_size == 0) {
- xfs_iext_destroy(ifp);
- } else {
- ifp->if_u1.if_ext_irec =
- kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS);
- }
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t nextents; /* number of extents in file */
- int size; /* size of file extents */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nextents = xfs_iext_count(ifp);
- ASSERT(nextents <= XFS_LINEAR_EXTS);
- size = nextents * sizeof(xfs_bmbt_rec_t);
-
- xfs_iext_irec_compact_pages(ifp);
- ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
- ep = ifp->if_u1.if_ext_irec->er_extbuf;
- kmem_free(ifp->if_u1.if_ext_irec);
- ifp->if_flags &= ~XFS_IFEXTIREC;
- ifp->if_u1.if_extents = ep;
- ifp->if_bytes = size;
- if (nextents < XFS_LINEAR_EXTS) {
- xfs_iext_realloc_direct(ifp, size);
- }
-}
-
-/*
- * Remove all records from the indirection array.
- */
-STATIC void
-xfs_iext_irec_remove_all(
- struct xfs_ifork *ifp)
-{
- int nlists;
- int i;
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (i = 0; i < nlists; i++)
- kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf);
- kmem_free(ifp->if_u1.if_ext_irec);
- ifp->if_flags &= ~XFS_IFEXTIREC;
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- if (ifp->if_flags & XFS_IFEXTIREC) {
- xfs_iext_irec_remove_all(ifp);
- } else if (ifp->if_real_bytes) {
- kmem_free(ifp->if_u1.if_extents);
- } else if (ifp->if_bytes) {
- memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
- sizeof(xfs_bmbt_rec_t));
- }
- ifp->if_u1.if_extents = NULL;
- ifp->if_real_bytes = 0;
- ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t * /* pointer to found extent record */
-xfs_iext_bno_to_ext(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number to search for */
- xfs_extnum_t *idxp) /* index of target extent */
-{
- xfs_bmbt_rec_host_t *base; /* pointer to first extent */
- xfs_filblks_t blockcount = 0; /* number of blocks in extent */
- xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
- xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
- int high; /* upper boundary in search */
- xfs_extnum_t idx = 0; /* index of target extent */
- int low; /* lower boundary in search */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_fileoff_t startoff = 0; /* start offset of extent */
-
- nextents = xfs_iext_count(ifp);
- if (nextents == 0) {
- *idxp = 0;
- return NULL;
- }
- low = 0;
- if (ifp->if_flags & XFS_IFEXTIREC) {
- /* Find target extent list */
- int erp_idx = 0;
- erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
- base = erp->er_extbuf;
- high = erp->er_extcount - 1;
- } else {
- base = ifp->if_u1.if_extents;
- high = nextents - 1;
- }
- /* Binary search extent records */
- while (low <= high) {
- idx = (low + high) >> 1;
- ep = base + idx;
- startoff = xfs_bmbt_get_startoff(ep);
- blockcount = xfs_bmbt_get_blockcount(ep);
- if (bno < startoff) {
- high = idx - 1;
- } else if (bno >= startoff + blockcount) {
- low = idx + 1;
- } else {
- /* Convert back to file-based extent index */
- if (ifp->if_flags & XFS_IFEXTIREC) {
- idx += erp->er_extoff;
- }
- *idxp = idx;
- return ep;
- }
- }
- /* Convert back to file-based extent index */
- if (ifp->if_flags & XFS_IFEXTIREC) {
- idx += erp->er_extoff;
- }
- if (bno >= startoff + blockcount) {
- if (++idx == nextents) {
- ep = NULL;
- } else {
- ep = xfs_iext_get_ext(ifp, idx);
- }
- }
- *idxp = idx;
- return ep;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t * /* pointer to found extent record */
-xfs_iext_bno_to_irec(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number to search for */
- int *erp_idxp) /* irec index of target ext list */
-{
- xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
- xfs_ext_irec_t *erp_next; /* next indirection array entry */
- int erp_idx; /* indirection array index */
- int nlists; /* number of extent irec's (lists) */
- int high; /* binary search upper limit */
- int low; /* binary search lower limit */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp_idx = 0;
- low = 0;
- high = nlists - 1;
- while (low <= high) {
- erp_idx = (low + high) >> 1;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
- if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
- high = erp_idx - 1;
- } else if (erp_next && bno >=
- xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
- low = erp_idx + 1;
- } else {
- break;
- }
- }
- *erp_idxp = erp_idx;
- return erp;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_extnum_t *idxp, /* extent index (file -> page) */
- int *erp_idxp, /* pointer to target irec */
- int realloc) /* new bytes were just added */
-{
- xfs_ext_irec_t *prev; /* pointer to previous irec */
- xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
- int erp_idx; /* indirection array index */
- int nlists; /* number of irec's (ex lists) */
- int high; /* binary search upper limit */
- int low; /* binary search lower limit */
- xfs_extnum_t page_idx = *idxp; /* extent index in target list */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(page_idx >= 0);
- ASSERT(page_idx <= xfs_iext_count(ifp));
- ASSERT(page_idx < xfs_iext_count(ifp) || realloc);
-
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp_idx = 0;
- low = 0;
- high = nlists - 1;
-
- /* Binary search extent irec's */
- while (low <= high) {
- erp_idx = (low + high) >> 1;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- prev = erp_idx > 0 ? erp - 1 : NULL;
- if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
- realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
- high = erp_idx - 1;
- } else if (page_idx > erp->er_extoff + erp->er_extcount ||
- (page_idx == erp->er_extoff + erp->er_extcount &&
- !realloc)) {
- low = erp_idx + 1;
- } else if (page_idx == erp->er_extoff + erp->er_extcount &&
- erp->er_extcount == XFS_LINEAR_EXTS) {
- ASSERT(realloc);
- page_idx = 0;
- erp_idx++;
- erp = erp_idx < nlists ? erp + 1 : NULL;
- break;
- } else {
- page_idx -= erp->er_extoff;
- break;
- }
- }
- *idxp = page_idx;
- *erp_idxp = erp_idx;
- return erp;
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- xfs_extnum_t nextents; /* number of extents in file */
-
- ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- nextents = xfs_iext_count(ifp);
- ASSERT(nextents <= XFS_LINEAR_EXTS);
-
- erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
- if (nextents == 0) {
- ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
- } else if (!ifp->if_real_bytes) {
- xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
- } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
- xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
- }
- erp->er_extbuf = ifp->if_u1.if_extents;
- erp->er_extcount = nextents;
- erp->er_extoff = 0;
-
- ifp->if_flags |= XFS_IFEXTIREC;
- ifp->if_real_bytes = XFS_IEXT_BUFSZ;
- ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
- ifp->if_u1.if_ext_irec = erp;
-
- return;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx) /* index for new irec */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
- /* Resize indirection array */
- xfs_iext_realloc_indirect(ifp, ++nlists *
- sizeof(xfs_ext_irec_t));
- /*
- * Move records down in the array so the
- * new page can use erp_idx.
- */
- erp = ifp->if_u1.if_ext_irec;
- for (i = nlists - 1; i > erp_idx; i--) {
- memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
- }
- ASSERT(i == erp_idx);
-
- /* Initialize new extent record */
- erp = ifp->if_u1.if_ext_irec;
- erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
- ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
- memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
- erp[erp_idx].er_extcount = 0;
- erp[erp_idx].er_extoff = erp_idx > 0 ?
- erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
- return (&erp[erp_idx]);
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx) /* irec index to remove */
-{
- xfs_ext_irec_t *erp; /* indirection array pointer */
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- if (erp->er_extbuf) {
- xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
- -erp->er_extcount);
- kmem_free(erp->er_extbuf);
- }
- /* Compact extent records */
- erp = ifp->if_u1.if_ext_irec;
- for (i = erp_idx; i < nlists - 1; i++) {
- memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
- }
- /*
- * Manually free the last extent record from the indirection
- * array. A call to xfs_iext_realloc_indirect() with a size
- * of zero would result in a call to xfs_iext_destroy() which
- * would in turn call this function again, creating a nasty
- * infinite loop.
- */
- if (--nlists) {
- xfs_iext_realloc_indirect(ifp,
- nlists * sizeof(xfs_ext_irec_t));
- } else {
- kmem_free(ifp->if_u1.if_ext_irec);
- }
- ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array. Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible. The
- * compaction policy is as follows:
- *
- * Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- * No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_extnum_t nextents; /* number of extents in file */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- nextents = xfs_iext_count(ifp);
-
- if (nextents == 0) {
- xfs_iext_destroy(ifp);
- } else if (nextents <= XFS_INLINE_EXTS) {
- xfs_iext_indirect_to_direct(ifp);
- xfs_iext_direct_to_inline(ifp, nextents);
- } else if (nextents <= XFS_LINEAR_EXTS) {
- xfs_iext_indirect_to_direct(ifp);
- } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
- xfs_iext_irec_compact_pages(ifp);
- }
-}
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
- int erp_idx = 0; /* indirection array index */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- while (erp_idx < nlists - 1) {
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- erp_next = erp + 1;
- if (erp_next->er_extcount <=
- (XFS_LINEAR_EXTS - erp->er_extcount)) {
- memcpy(&erp->er_extbuf[erp->er_extcount],
- erp_next->er_extbuf, erp_next->er_extcount *
- sizeof(xfs_bmbt_rec_t));
- erp->er_extcount += erp_next->er_extcount;
- /*
- * Free page before removing extent record
- * so er_extoffs don't get modified in
- * xfs_iext_irec_remove.
- */
- kmem_free(erp_next->er_extbuf);
- erp_next->er_extbuf = NULL;
- xfs_iext_irec_remove(ifp, erp_idx + 1);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- } else {
- erp_idx++;
- }
- }
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
- xfs_ifork_t *ifp, /* inode fork pointer */
- int erp_idx, /* irec index to update */
- int ext_diff) /* number of new extents */
-{
- int i; /* loop counter */
- int nlists; /* number of irec's (ex lists */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- for (i = erp_idx; i < nlists; i++) {
- ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
- }
-}
-
-/*
* Initialize an inode's copy-on-write fork.
*/
void
@@ -1974,61 +831,3 @@ xfs_ifork_init_cow(
ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
ip->i_cnextents = 0;
}
-
-/*
- * Lookup the extent covering bno.
- *
- * If there is an extent covering bno return the extent index, and store the
- * expanded extent structure in *gotp, and the extent index in *idx.
- * If there is no extent covering bno, but there is an extent after it (e.g.
- * it lies in a hole) return that extent in *gotp and its index in *idx
- * instead.
- * If bno is beyond the last extent return false, and return the index after
- * the last valid index in *idxp.
- */
-bool
-xfs_iext_lookup_extent(
- struct xfs_inode *ip,
- struct xfs_ifork *ifp,
- xfs_fileoff_t bno,
- xfs_extnum_t *idxp,
- struct xfs_bmbt_irec *gotp)
-{
- struct xfs_bmbt_rec_host *ep;
-
- XFS_STATS_INC(ip->i_mount, xs_look_exlist);
-
- ep = xfs_iext_bno_to_ext(ifp, bno, idxp);
- if (!ep)
- return false;
- xfs_bmbt_get_all(ep, gotp);
- return true;
-}
-
-/*
- * Return true if there is an extent at index idx, and return the expanded
- * extent structure at idx in that case. Else return false.
- */
-bool
-xfs_iext_get_extent(
- struct xfs_ifork *ifp,
- xfs_extnum_t idx,
- struct xfs_bmbt_irec *gotp)
-{
- if (idx < 0 || idx >= xfs_iext_count(ifp))
- return false;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp);
- return true;
-}
-
-void
-xfs_iext_update_extent(
- struct xfs_ifork *ifp,
- xfs_extnum_t idx,
- struct xfs_bmbt_irec *gotp)
-{
- ASSERT(idx >= 0);
- ASSERT(idx < xfs_iext_count(ifp));
-
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp);
-}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 11af705219f6..b9f0098e33b8 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -22,56 +22,19 @@ struct xfs_inode_log_item;
struct xfs_dinode;
/*
- * The following xfs_ext_irec_t struct introduces a second (top) level
- * to the in-core extent allocation scheme. These structs are allocated
- * in a contiguous block, creating an indirection array where each entry
- * (irec) contains a pointer to a buffer of in-core extent records which
- * it manages. Each extent buffer is 4k in size, since 4k is the system
- * page size on Linux i386 and systems with larger page sizes don't seem
- * to gain much, if anything, by using their native page size as the
- * extent buffer size. Also, using 4k extent buffers everywhere provides
- * a consistent interface for CXFS across different platforms.
- *
- * There is currently no limit on the number of irec's (extent lists)
- * allowed, so heavily fragmented files may require an indirection array
- * which spans multiple system pages of memory. The number of extents
- * which would require this amount of contiguous memory is very large
- * and should not cause problems in the foreseeable future. However,
- * if the memory needed for the contiguous array ever becomes a problem,
- * it is possible that a third level of indirection may be required.
- */
-typedef struct xfs_ext_irec {
- xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
- xfs_extnum_t er_extoff; /* extent offset in file */
- xfs_extnum_t er_extcount; /* number of extents in page/block */
-} xfs_ext_irec_t;
-
-/*
* File incore extent information, present for each of data & attr forks.
*/
-#define XFS_IEXT_BUFSZ 4096
-#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
-#define XFS_INLINE_EXTS 2
-#define XFS_INLINE_DATA 32
typedef struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
int if_real_bytes; /* bytes allocated in if_u1 */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
+ int if_height; /* height of the extent tree */
union {
- xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
- xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
+ void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
- union {
- xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
- /* very small file extents */
- char if_inline_data[XFS_INLINE_DATA];
- /* very small file data */
- xfs_dev_t if_rdev; /* dev number if special */
- uuid_t if_uuid; /* mount point value */
- } if_u2;
} xfs_ifork_t;
/*
@@ -80,7 +43,6 @@ typedef struct xfs_ifork {
#define XFS_IFINLINE 0x01 /* Inline data is read in */
#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
-#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
/*
* Fork handling.
@@ -150,45 +112,75 @@ int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
-struct xfs_bmbt_rec_host *
- xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
-xfs_extnum_t xfs_iext_count(struct xfs_ifork *);
-void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
- struct xfs_bmbt_irec *, int);
-void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
- xfs_extnum_t, int);
-void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
-void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
-void xfs_iext_realloc_direct(struct xfs_ifork *, int);
-void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
-void xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *, int);
+void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
+ int);
void xfs_iext_destroy(struct xfs_ifork *);
-struct xfs_bmbt_rec_host *
- xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
-struct xfs_ext_irec *
- xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
-struct xfs_ext_irec *
- xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
- int);
-void xfs_iext_irec_init(struct xfs_ifork *);
-struct xfs_ext_irec *
- xfs_iext_irec_new(struct xfs_ifork *, int);
-void xfs_iext_irec_remove(struct xfs_ifork *, int);
-void xfs_iext_irec_compact(struct xfs_ifork *);
-void xfs_iext_irec_compact_pages(struct xfs_ifork *);
-void xfs_iext_irec_compact_full(struct xfs_ifork *);
-void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
bool xfs_iext_lookup_extent(struct xfs_inode *ip,
struct xfs_ifork *ifp, xfs_fileoff_t bno,
- xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp);
-bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *gotp);
-void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+bool xfs_iext_lookup_extent_before(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t *end,
+ struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *gotp);
+bool xfs_iext_get_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+void xfs_iext_update_extent(struct xfs_inode *ip, int state,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *gotp);
+
+void xfs_iext_first(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_last(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_next(struct xfs_ifork *, struct xfs_iext_cursor *);
+void xfs_iext_prev(struct xfs_ifork *, struct xfs_iext_cursor *);
+
+static inline bool xfs_iext_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_next(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+static inline bool xfs_iext_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ xfs_iext_prev(ifp, cur);
+ return xfs_iext_get_extent(ifp, cur, gotp);
+}
+
+/*
+ * Return the extent after cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_next_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_next(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+/*
+ * Return the extent before cur in gotp without updating the cursor.
+ */
+static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_iext_cursor ncur = *cur;
+
+ xfs_iext_prev(ifp, &ncur);
+ return xfs_iext_get_extent(ifp, &ncur, gotp);
+}
+
+#define for_each_xfs_iext(ifp, ext, got) \
+ for (xfs_iext_first((ifp), (ext)); \
+ xfs_iext_get_extent((ifp), (ext), (got)); \
+ xfs_iext_next((ifp), (ext)))
extern struct kmem_zone *xfs_ifork_zone;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 8372e9bcd7b6..349d9f8edb89 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -264,54 +264,43 @@ typedef struct xfs_trans_header {
* (if any) is indicated in the ilf_dsize field. Changes to this structure
* must be added on to the end.
*/
-typedef struct xfs_inode_log_format {
- uint16_t ilf_type; /* inode log item type */
- uint16_t ilf_size; /* size of this item */
- uint32_t ilf_fields; /* flags for fields logged */
- uint16_t ilf_asize; /* size of attr d/ext/root */
- uint16_t ilf_dsize; /* size of data/ext/root */
- uint64_t ilf_ino; /* inode number */
- union {
- uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
- } ilf_u;
- int64_t ilf_blkno; /* blkno of inode buffer */
- int32_t ilf_len; /* len of inode buffer */
- int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
+struct xfs_inode_log_format {
uint16_t ilf_type; /* inode log item type */
uint16_t ilf_size; /* size of this item */
uint32_t ilf_fields; /* flags for fields logged */
uint16_t ilf_asize; /* size of attr d/ext/root */
uint16_t ilf_dsize; /* size of data/ext/root */
+ uint32_t ilf_pad; /* pad for 64 bit boundary */
uint64_t ilf_ino; /* inode number */
union {
uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
+ uint8_t __pad[16]; /* unused */
} ilf_u;
int64_t ilf_blkno; /* blkno of inode buffer */
int32_t ilf_len; /* len of inode buffer */
int32_t ilf_boffset; /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
+};
-typedef struct xfs_inode_log_format_64 {
+/*
+ * Old 32 bit systems will log in this format without the 64 bit
+ * alignment padding. Recovery will detect this and convert it to the
+ * correct format.
+ */
+struct xfs_inode_log_format_32 {
uint16_t ilf_type; /* inode log item type */
uint16_t ilf_size; /* size of this item */
uint32_t ilf_fields; /* flags for fields logged */
uint16_t ilf_asize; /* size of attr d/ext/root */
uint16_t ilf_dsize; /* size of data/ext/root */
- uint32_t ilf_pad; /* pad for 64 bit boundary */
uint64_t ilf_ino; /* inode number */
union {
uint32_t ilfu_rdev; /* rdev value for dev inode*/
- uuid_t ilfu_uuid; /* mount point value */
+ uint8_t __pad[16]; /* unused */
} ilf_u;
int64_t ilf_blkno; /* blkno of inode buffer */
int32_t ilf_len; /* len of inode buffer */
int32_t ilf_boffset; /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
+} __attribute__((packed));
/*
@@ -322,7 +311,7 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
#define XFS_ILOG_DEV 0x010 /* log the dev field */
-#define XFS_ILOG_UUID 0x020 /* log the uuid field */
+#define XFS_ILOG_UUID 0x020 /* added long ago, but never used */
#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
@@ -340,9 +329,9 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
- XFS_ILOG_UUID | XFS_ILOG_ADATA | \
- XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
- XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT)
@@ -352,10 +341,10 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
- XFS_ILOG_DEV | XFS_ILOG_UUID | \
- XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
- XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
+ XFS_ILOG_DEV | XFS_ILOG_ADATA | \
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+ XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \
+ XFS_ILOG_AOWNER)
static inline int xfs_ilog_fbroot(int w)
{
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 9d5406b4f663..585b35d34142 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -30,6 +30,7 @@
#include "xfs_bmap.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 55c88a732690..dd019cee1b3b 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,6 +34,7 @@
#include "xfs_rmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_bmap.h"
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5d4e43ef4eea..3fb29a5ea915 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -672,7 +672,6 @@ xfs_rtmodify_range(
/*
* Compute a mask of relevant bits.
*/
- bit = 0;
mask = ((xfs_rtword_t)1 << lastbit) - 1;
/*
* Set/clear the active bits.
@@ -1086,3 +1085,15 @@ xfs_rtalloc_query_all(
return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
}
+
+/*
+ * Verify that a realtime block number pointer doesn't point off the
+ * end of the realtime device.
+ */
+bool
+xfs_verify_rtbno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rtbno < mp->m_sb.sb_rblocks;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 0220159bd463..3c560695c546 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -48,6 +48,12 @@ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
typedef int64_t xfs_sfiloff_t; /* signed block number in a file */
/*
+ * New verifiers will return the instruction address of the failing check.
+ * NULL means everything is ok.
+ */
+typedef void * xfs_failaddr_t;
+
+/*
* Null values for the types.
*/
#define NULLFSBLOCK ((xfs_fsblock_t)-1)
@@ -136,5 +142,21 @@ typedef uint32_t xfs_dqid_t;
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
+struct xfs_iext_cursor {
+ struct xfs_iext_leaf *leaf;
+ int pos;
+};
+
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+} xfs_exntst_t;
+
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
new file mode 100644
index 000000000000..2a9b4f9e93c6
--- /dev/null
+++ b/fs/xfs/scrub/agheader.c
@@ -0,0 +1,658 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Set up scrub to check the static metadata of one AG, meaning the SB,
+ * AGF, AGI, and AGFL headers.
+ *
+ * Rejects the request with -EINVAL when the AG number is out of range or
+ * when an inode number or generation was supplied; otherwise hands off to
+ * xfs_scrub_setup_fs() and returns its result.
+ */
+int
+xfs_scrub_setup_ag_header(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	/* The AG number must be below the filesystem's AG count. */
+	if (sc->sm->sm_agno >= sc->mp->m_sb.sb_agcount)
+		return -EINVAL;
+
+	/* Header scrubs take no inode, so ino and gen must both be zero. */
+	if (sc->sm->sm_ino || sc->sm->sm_gen)
+		return -EINVAL;
+
+	return xfs_scrub_setup_fs(sc, ip);
+}
+
+/*
+ * Call @fn for every block number stored in the active part of the AGFL.
+ *
+ * The active entries run from agf_flfirst to agf_fllast and may wrap around
+ * the end of the array.  The walk stops as soon as @fn returns an error
+ * (which is passed back to the caller) or as soon as the scrub has been
+ * flagged XFS_SCRUB_OFLAG_CORRUPT (in which case zero is returned).
+ * @priv is handed to @fn untouched.
+ */
+int
+xfs_scrub_walk_agfl(
+	struct xfs_scrub_context	*sc,
+	int				(*fn)(struct xfs_scrub_context *,
+					      xfs_agblock_t bno, void *),
+	void				*priv)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_agf			*agf;
+	__be32				*agfl_bno;
+	unsigned int			head;
+	unsigned int			tail;
+	int				i;
+	int				error;
+
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp);
+	head = be32_to_cpu(agf->agf_flfirst);
+	tail = be32_to_cpu(agf->agf_fllast);
+
+	/* An empty free list has nothing to visit. */
+	if (agf->agf_flcount == cpu_to_be32(0))
+		return 0;
+
+	if (head > tail) {
+		/* Wrapped list: visit from the first entry to the array end. */
+		for (i = head; i < XFS_AGFL_SIZE(mp); i++) {
+			error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+			if (error ||
+			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+				return error;
+		}
+		/* ...then continue from the start of the array. */
+		i = 0;
+	} else {
+		/* Consecutive list: a single run from first to last. */
+		i = head;
+	}
+
+	/* Visit the run that ends at the last active entry. */
+	for (; i <= tail; i++) {
+		error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
+		if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+			return error;
+	}
+
+	return 0;
+}
+
+/* Superblock */
+
+/*
+ * Scrub the filesystem superblock.
+ *
+ * Reads the superblock of the AG named in the scrub request and compares
+ * it field by field with the in-core superblock (mp->m_sb). Mismatches
+ * are reported through xfs_scrub_block_set_corrupt() or, for fields that
+ * can legitimately change after mkfs, xfs_scrub_block_set_preen().
+ *
+ * Note: We do /not/ attempt to check AG 0's superblock. Mount is
+ * responsible for validating all the geometry information in sb 0, so
+ * if the filesystem is capable of initiating online scrub, then clearly
+ * sb 0 is ok and we can use its information to check everything else.
+ *
+ * Returns 0 for AG 0, otherwise the error value left behind by the
+ * buffer read and xfs_scrub_process_error().
+ */
+int
+xfs_scrub_superblock(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_dsb *sb;
+ xfs_agnumber_t agno;
+ uint32_t v2_ok;
+ __be32 features_mask;
+ int error;
+ __be16 vernum_mask;
+
+ /* AG 0's superblock is the reference copy; nothing to compare. */
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return 0;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ /*
+ * NOTE(review): presumably xfs_scrub_process_error() returns false
+ * when the read failed and checking cannot continue - confirm in
+ * scrub/common.c.
+ */
+ if (!xfs_scrub_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
+ return error;
+
+ sb = XFS_BUF_TO_SBP(bp);
+
+ /*
+ * Verify the geometries match. Fields that are permanently
+ * set by mkfs are checked; fields that can be updated later
+ * (and are not propagated to backup superblocks) are preen
+ * checked.
+ */
+ if (sb->sb_blocksize != cpu_to_be32(mp->m_sb.sb_blocksize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_dblocks != cpu_to_be64(mp->m_sb.sb_dblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rblocks != cpu_to_be64(mp->m_sb.sb_rblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextents != cpu_to_be64(mp->m_sb.sb_rextents))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (!uuid_equal(&sb->sb_uuid, &mp->m_sb.sb_uuid))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_logstart != cpu_to_be64(mp->m_sb.sb_logstart))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblocks != cpu_to_be32(mp->m_sb.sb_agblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agcount != cpu_to_be32(mp->m_sb.sb_agcount))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rbmblocks != cpu_to_be32(mp->m_sb.sb_rbmblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logblocks != cpu_to_be32(mp->m_sb.sb_logblocks))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that are set at mkfs time. */
+ vernum_mask = cpu_to_be16(~XFS_SB_VERSION_OKBITS |
+ XFS_SB_VERSION_NUMBITS |
+ XFS_SB_VERSION_ALIGNBIT |
+ XFS_SB_VERSION_DALIGNBIT |
+ XFS_SB_VERSION_SHAREDBIT |
+ XFS_SB_VERSION_LOGV2BIT |
+ XFS_SB_VERSION_SECTORBIT |
+ XFS_SB_VERSION_EXTFLGBIT |
+ XFS_SB_VERSION_DIRV2BIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_versionnum bits that can be set after mkfs time. */
+ vernum_mask = cpu_to_be16(XFS_SB_VERSION_ATTRBIT |
+ XFS_SB_VERSION_NLINKBIT |
+ XFS_SB_VERSION_QUOTABIT);
+ if ((sb->sb_versionnum & vernum_mask) !=
+ (cpu_to_be16(mp->m_sb.sb_versionnum) & vernum_mask))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_sectsize != cpu_to_be16(mp->m_sb.sb_sectsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodesize != cpu_to_be16(mp->m_sb.sb_inodesize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblock != cpu_to_be16(mp->m_sb.sb_inopblock))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (memcmp(sb->sb_fname, mp->m_sb.sb_fname, sizeof(sb->sb_fname)))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /* The following fields are compared without any byte swapping. */
+ if (sb->sb_blocklog != mp->m_sb.sb_blocklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_sectlog != mp->m_sb.sb_sectlog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inodelog != mp->m_sb.sb_inodelog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inopblog != mp->m_sb.sb_inopblog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_agblklog != mp->m_sb.sb_agblklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_imax_pct != mp->m_sb.sb_imax_pct)
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /*
+ * Skip the summary counters since we track them in memory anyway.
+ * sb_icount, sb_ifree, sb_fdblocks, sb_frextents
+ */
+
+ if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /*
+ * Skip the quota flags since repair will force quotacheck.
+ * sb_qflags
+ */
+
+ if (sb->sb_flags != mp->m_sb.sb_flags)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_shared_vn != mp->m_sb.sb_shared_vn)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_inoalignmt != cpu_to_be32(mp->m_sb.sb_inoalignmt))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_unit != cpu_to_be32(mp->m_sb.sb_unit))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_width != cpu_to_be32(mp->m_sb.sb_width))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ if (sb->sb_dirblklog != mp->m_sb.sb_dirblklog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectlog != mp->m_sb.sb_logsectlog)
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsectsize != cpu_to_be16(mp->m_sb.sb_logsectsize))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_logsunit != cpu_to_be32(mp->m_sb.sb_logsunit))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Do we see any invalid bits in sb_features2? */
+ if (!xfs_sb_version_hasmorebits(&mp->m_sb)) {
+ /* Without "morebits" the whole field has to be zero. */
+ if (sb->sb_features2 != 0)
+ xfs_scrub_block_set_corrupt(sc, bp);
+ } else {
+ /* The CRC bit is only acceptable on v5 (and later) superblocks. */
+ v2_ok = XFS_SB_VERSION2_OKBITS;
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5)
+ v2_ok |= XFS_SB_VERSION2_CRCBIT;
+
+ if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* The two copies of features2 disagreeing is only preen-worthy. */
+ if (sb->sb_features2 != sb->sb_bad_features2)
+ xfs_scrub_block_set_preen(sc, bp);
+ }
+
+ /* Check sb_features2 flags that are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_LAZYSBCOUNTBIT |
+ XFS_SB_VERSION2_PROJID32BIT |
+ XFS_SB_VERSION2_CRCBIT |
+ XFS_SB_VERSION2_FTYPE);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check sb_features2 flags that can be set after mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_VERSION2_ATTR2BIT);
+ if ((sb->sb_features2 & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features2) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+ /* all v5 fields must be zero */
+ if (memchr_inv(&sb->sb_features_compat, 0,
+ sizeof(struct xfs_dsb) -
+ offsetof(struct xfs_dsb, sb_features_compat)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+ } else {
+ /* Check compat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_COMPAT_UNKNOWN);
+ if ((sb->sb_features_compat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_compat) & features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check ro compat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_RO_COMPAT_UNKNOWN |
+ XFS_SB_FEAT_RO_COMPAT_FINOBT |
+ XFS_SB_FEAT_RO_COMPAT_RMAPBT |
+ XFS_SB_FEAT_RO_COMPAT_REFLINK);
+ if ((sb->sb_features_ro_compat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_ro_compat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check incompat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_UNKNOWN |
+ XFS_SB_FEAT_INCOMPAT_FTYPE |
+ XFS_SB_FEAT_INCOMPAT_SPINODES |
+ XFS_SB_FEAT_INCOMPAT_META_UUID);
+ if ((sb->sb_features_incompat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_incompat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Check log incompat flags; all are set at mkfs time. */
+ features_mask = cpu_to_be32(XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN);
+ if ((sb->sb_features_log_incompat & features_mask) !=
+ (cpu_to_be32(mp->m_sb.sb_features_log_incompat) &
+ features_mask))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ /* Don't care about sb_crc */
+
+ if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+ xfs_scrub_block_set_preen(sc, bp);
+
+ /* Don't care about sb_lsn */
+ }
+
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb)) {
+ /* The metadata UUID must be the same for all supers */
+ if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+ xfs_scrub_block_set_corrupt(sc, bp);
+ }
+
+ /* Everything else must be zero. */
+ /* "Everything else" = the bytes of the buffer after struct xfs_dsb. */
+ if (memchr_inv(sb + 1, 0,
+ BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
+ xfs_scrub_block_set_corrupt(sc, bp);
+
+ return error;
+}
+
+/* AGF */
+
+/*
+ * Scrub the AGF.
+ *
+ * Reads the AG headers of the AG named in the scrub request, then checks
+ * the AGF's recorded AG length, the root block and height of each btree
+ * rooted in the AGF (rmap and refcount only when the feature is enabled),
+ * and the free list counters.  Every problem found is reported by flagging
+ * the AGF buffer corrupt.
+ *
+ * Returns the error value left behind by xfs_scrub_ag_read_headers() and
+ * xfs_scrub_process_error().
+ */
+int
+xfs_scrub_agf(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_agf			*agf;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			eoag;
+	xfs_agblock_t			agfl_first;
+	xfs_agblock_t			agfl_last;
+	xfs_agblock_t			agfl_count;
+	xfs_agblock_t			fl_count;
+	int				level;
+	int				error = 0;
+
+	agno = sc->sa.agno = sc->sm->sm_agno;
+	error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+			&sc->sa.agf_bp, &sc->sa.agfl_bp);
+	if (!xfs_scrub_process_error(sc, agno, XFS_AGF_BLOCK(sc->mp), &error))
+		goto out;
+
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+
+	/* Check the AG length */
+	eoag = be32_to_cpu(agf->agf_length);
+	if (eoag != xfs_ag_block_count(mp, agno))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+	/* Check the AGF btree roots and levels */
+	agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+	if (!xfs_verify_agbno(mp, agno, agbno))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+	agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+	if (!xfs_verify_agbno(mp, agno, agbno))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+	level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+	if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+	level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+	if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+		if (!xfs_verify_agbno(mp, agno, agbno))
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+		level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+		if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+	}
+
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		agbno = be32_to_cpu(agf->agf_refcount_root);
+		if (!xfs_verify_agbno(mp, agno, agbno))
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+		level = be32_to_cpu(agf->agf_refcount_level);
+		if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+	}
+
+	/*
+	 * Check the AGFL counters.  The free list is a circular array, so
+	 * first == last describes a list with exactly one active entry (the
+	 * same convention xfs_scrub_walk_agfl() uses); only last < first
+	 * means the active region wraps around the end of the array.
+	 * An flcount of zero is not compared against the indices at all.
+	 */
+	agfl_first = be32_to_cpu(agf->agf_flfirst);
+	agfl_last = be32_to_cpu(agf->agf_fllast);
+	agfl_count = be32_to_cpu(agf->agf_flcount);
+	if (agfl_last >= agfl_first)
+		fl_count = agfl_last - agfl_first + 1;
+	else
+		fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1;
+	if (agfl_count != 0 && fl_count != agfl_count)
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+
+out:
+	return error;
+}
+
+/* AGFL */
+
+/*
+ * Scratch state for the AGFL scrubber: the block numbers collected while
+ * walking the free list, so that they can be sorted and checked for
+ * duplicates afterwards.
+ */
+struct xfs_scrub_agfl_info {
+ unsigned int sz_entries; /* capacity of entries[] */
+ unsigned int nr_entries; /* number of entries[] filled in so far */
+ xfs_agblock_t *entries; /* collected AGFL block numbers */
+};
+
+/*
+ * AGFL walk callback: scrub one free list block number.
+ *
+ * @priv points to the struct xfs_scrub_agfl_info being filled in.  A block
+ * number that passes xfs_verify_agbno() is appended to the collection while
+ * there is still room; otherwise the AGFL buffer is flagged corrupt.
+ * Always returns zero.
+ */
+STATIC int
+xfs_scrub_agfl_block(
+	struct xfs_scrub_context	*sc,
+	xfs_agblock_t			agbno,
+	void				*priv)
+{
+	struct xfs_scrub_agfl_info	*info = priv;
+
+	if (!xfs_verify_agbno(sc->mp, sc->sa.agno, agbno) ||
+	    info->nr_entries >= info->sz_entries) {
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agfl_bp);
+		return 0;
+	}
+
+	info->entries[info->nr_entries] = agbno;
+	info->nr_entries++;
+	return 0;
+}
+
+/*
+ * sort() comparison function for AG block numbers.
+ *
+ * Returns a negative value, zero, or a positive value when *pa is less
+ * than, equal to, or greater than *pb.  The values are compared instead of
+ * subtracted: a difference forced into an int can overflow and report the
+ * wrong order when the two block numbers are far apart.
+ */
+static int
+xfs_scrub_agblock_cmp(
+	const void		*pa,
+	const void		*pb)
+{
+	const xfs_agblock_t	*a = pa;
+	const xfs_agblock_t	*b = pb;
+
+	if (*a < *b)
+		return -1;
+	return *a > *b;
+}
+
+/*
+ * Scrub the AGFL.
+ *
+ * Reads the AG headers, collects every active free list entry into a
+ * temporary array (validating each block number on the way), checks that
+ * the number collected matches agf_flcount, and finally sorts the array to
+ * look for duplicate block numbers.
+ *
+ * Returns -EFSCORRUPTED if no AGF buffer is available, -ENOMEM if the
+ * temporary array cannot be allocated, and otherwise the error value left
+ * behind by the header read or the AGFL walk.
+ */
+int
+xfs_scrub_agfl(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_scrub_agfl_info	sai = { 0 };
+	struct xfs_agf			*agf;
+	xfs_agnumber_t			agno;
+	unsigned int			expected;
+	unsigned int			cur;
+	int				error;
+
+	agno = sc->sa.agno = sc->sm->sm_agno;
+	error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+			&sc->sa.agf_bp, &sc->sa.agfl_bp);
+	if (!xfs_scrub_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+		return error;
+	if (!sc->sa.agf_bp)
+		return -EFSCORRUPTED;
+
+	/* The AGF says how many entries to expect; it cannot exceed the AGFL. */
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	expected = be32_to_cpu(agf->agf_flcount);
+	if (expected > XFS_AGFL_SIZE(sc->mp)) {
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+		return error;
+	}
+
+	/* Allocate the array used to ensure uniqueness of AGFL entries. */
+	sai.sz_entries = expected;
+	sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * expected, KM_NOFS);
+	if (!sai.entries)
+		return -ENOMEM;
+
+	/* Check (and collect) the blocks in the AGFL. */
+	error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai);
+	if (error)
+		goto out_free;
+
+	if (sai.nr_entries != expected) {
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+		goto out_free;
+	}
+
+	/* After sorting, duplicates sit next to each other. */
+	sort(sai.entries, sai.nr_entries, sizeof(sai.entries[0]),
+			xfs_scrub_agblock_cmp, NULL);
+	cur = 1;
+	while (cur < sai.nr_entries) {
+		if (sai.entries[cur - 1] == sai.entries[cur]) {
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp);
+			break;
+		}
+		cur++;
+	}
+
+out_free:
+	kmem_free(sai.entries);
+	return error;
+}
+
+/* AGI */
+
+/*
+ * Scrub the AGI.
+ *
+ * Reads the AG headers of the AG named in the scrub request, then checks
+ * the AGI's recorded AG length, the inode btree root(s) and height(s), the
+ * inode counters, the inode pointers, the unlinked buckets and the padding.
+ * Every problem found is reported by flagging the AGI buffer corrupt.
+ *
+ * Returns the error value left behind by xfs_scrub_ag_read_headers() and
+ * xfs_scrub_process_error().
+ */
+int
+xfs_scrub_agi(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_agi			*agi;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			eoag;
+	xfs_agino_t			agino;
+	xfs_agino_t			first_agino;
+	xfs_agino_t			last_agino;
+	xfs_agino_t			icount;
+	int				bucket;
+	int				level;
+	int				error = 0;
+
+	agno = sc->sa.agno = sc->sm->sm_agno;
+	error = xfs_scrub_ag_read_headers(sc, agno, &sc->sa.agi_bp,
+			&sc->sa.agf_bp, &sc->sa.agfl_bp);
+	if (!xfs_scrub_process_error(sc, agno, XFS_AGI_BLOCK(sc->mp), &error))
+		return error;
+
+	agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+
+	/* The recorded AG length must match the computed one. */
+	eoag = be32_to_cpu(agi->agi_length);
+	if (eoag != xfs_ag_block_count(mp, agno))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	/* Inode btree: root block and tree height. */
+	agbno = be32_to_cpu(agi->agi_root);
+	if (!xfs_verify_agbno(mp, agno, agbno))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	level = be32_to_cpu(agi->agi_level);
+	if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	/* Free inode btree, when the filesystem has one. */
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		agbno = be32_to_cpu(agi->agi_free_root);
+		if (!xfs_verify_agbno(mp, agno, agbno))
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+		level = be32_to_cpu(agi->agi_free_level);
+		if (level <= 0 || level > XFS_BTREE_MAXLEVELS)
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+	}
+
+	/*
+	 * Inode counters: the count may not exceed the size of the AG's
+	 * inode number range, nor be smaller than the free count.
+	 */
+	xfs_ialloc_agino_range(mp, agno, &first_agino, &last_agino);
+	icount = be32_to_cpu(agi->agi_count);
+	if (icount > last_agino - first_agino + 1)
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+	else if (icount < be32_to_cpu(agi->agi_freecount))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	/* Inode pointers: NULLAGINO is fine, anything else must verify. */
+	agino = be32_to_cpu(agi->agi_newino);
+	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	agino = be32_to_cpu(agi->agi_dirino);
+	if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	/* Unlinked buckets follow the same rule as the inode pointers. */
+	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+		agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+		if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+			xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+	}
+
+	/* The padding must be zero. */
+	if (agi->agi_pad32 != cpu_to_be32(0))
+		xfs_scrub_block_set_corrupt(sc, sc->sa.agi_bp);
+
+	return error;
+}
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
new file mode 100644
index 000000000000..059663e13414
--- /dev/null
+++ b/fs/xfs/scrub/alloc.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub free space btrees.
+ *
+ * Thin wrapper: all the work is done by xfs_scrub_setup_ag_btree(), whose
+ * result is returned unchanged.
+ */
+int
+xfs_scrub_setup_ag_allocbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ /* NOTE(review): meaning of the 'false' argument is defined by the callee - confirm. */
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Free space btree scrubber. */
+
+/*
+ * Scrub a bnobt/cntbt record.
+ *
+ * The record is flagged corrupt when start + length wraps around or is
+ * empty, or when either the first or the last block of the extent fails
+ * xfs_verify_agbno().  Always returns zero.
+ */
+STATIC int
+xfs_scrub_allocbt_rec(
+	struct xfs_scrub_btree		*bs,
+	union xfs_btree_rec		*rec)
+{
+	struct xfs_mount		*mp = bs->cur->bc_mp;
+	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
+	xfs_agblock_t			bno;
+	xfs_extlen_t			len;
+	bool				ok;
+
+	bno = be32_to_cpu(rec->alloc.ar_startblock);
+	len = be32_to_cpu(rec->alloc.ar_blockcount);
+
+	/* Each test only runs if all the previous ones passed. */
+	ok = bno + len > bno;
+	if (ok)
+		ok = xfs_verify_agbno(mp, agno, bno);
+	if (ok)
+		ok = xfs_verify_agbno(mp, agno, bno + len - 1);
+
+	if (!ok)
+		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	return 0;
+}
+
+/*
+ * Scrub one of the free space btrees of an AG.
+ *
+ * @which selects the cursor: XFS_BTNUM_BNO picks the by-block cursor, any
+ * other value picks the by-count cursor.  Each record is checked by
+ * xfs_scrub_allocbt_rec(); the result of xfs_scrub_btree() is returned.
+ */
+STATIC int
+xfs_scrub_allocbt(
+	struct xfs_scrub_context	*sc,
+	xfs_btnum_t			which)
+{
+	struct xfs_owner_info		oinfo;
+	struct xfs_btree_cur		*cur;
+
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+
+	if (which == XFS_BTNUM_BNO)
+		cur = sc->sa.bno_cur;
+	else
+		cur = sc->sa.cnt_cur;
+
+	return xfs_scrub_btree(sc, cur, xfs_scrub_allocbt_rec, &oinfo, NULL);
+}
+
+/* Scrub the free space btree selected by XFS_BTNUM_BNO. */
+int
+xfs_scrub_bnobt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_allocbt(sc, XFS_BTNUM_BNO);
+}
+
+/* Scrub the free space btree selected by XFS_BTNUM_CNT. */
+int
+xfs_scrub_cntbt(
+ struct xfs_scrub_context *sc)
+{
+ return xfs_scrub_allocbt(sc, XFS_BTNUM_CNT);
+}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
new file mode 100644
index 000000000000..4ed80474f545
--- /dev/null
+++ b/fs/xfs/scrub/attr.c
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/dabtree.h"
+#include "scrub/trace.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/*
+ * Set us up to scrub an inode's extended attributes.
+ *
+ * Allocates the scratch buffer sc->buf and then defers to
+ * xfs_scrub_setup_inode_contents().  Returns -ENOMEM if the buffer cannot
+ * be allocated, otherwise whatever the inode setup returns.
+ */
+int
+xfs_scrub_setup_xattr(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	size_t				bitmap_bytes;
+	size_t				sz;
+
+	/*
+	 * Allocate the buffer without the inode lock held.  It is used
+	 * either to read an xattr value (up to XATTR_SIZE_MAX bytes) or to
+	 * hold three bitmaps with one bit per byte of an attr block, never
+	 * both at the same time, so size it for the larger of the two.
+	 */
+	bitmap_bytes = 3 * sizeof(long) *
+			BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+	sz = max_t(size_t, XATTR_SIZE_MAX, bitmap_bytes);
+
+	sc->buf = kmem_zalloc_large(sz, KM_SLEEP);
+	if (!sc->buf)
+		return -ENOMEM;
+
+	return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Extended Attributes */
+
+/*
+ * Attr list context bundled with the scrub context. The list context is
+ * embedded (not pointed to) because xfs_scrub_xattr_listent() recovers this
+ * structure from its context argument with container_of().
+ */
+struct xfs_scrub_xattr {
+ struct xfs_attr_list_context context;
+ struct xfs_scrub_context *sc;
+};
+
+/*
+ * Check that an extended attribute key can be looked up by hash.
+ *
+ * This is the per-key callback of the XFS attribute list iterator
+ * (i.e. xfs_attr_list_int_ilocked).  For each key the attribute value is
+ * fetched into sc->buf to see whether the lookup fails, and the length of
+ * the fetched value is compared with the @valuelen the iterator reported.
+ * Keys marked incomplete are not looked up; the inode is merely flagged
+ * for preening.
+ */
+static void
+xfs_scrub_xattr_listent(
+	struct xfs_attr_list_context	*context,
+	int				flags,
+	unsigned char			*name,
+	int				namelen,
+	int				valuelen)
+{
+	struct xfs_da_args		args = { NULL };
+	struct xfs_scrub_xattr		*sx;
+	int				error;
+
+	sx = container_of(context, struct xfs_scrub_xattr, context);
+
+	/* Incomplete attr key, just mark the inode for preening. */
+	if (flags & XFS_ATTR_INCOMPLETE) {
+		xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL);
+		return;
+	}
+
+	/* Select the namespace the key was listed from. */
+	args.flags = ATTR_KERNOTIME;
+	if (flags & XFS_ATTR_ROOT)
+		args.flags |= ATTR_ROOT;
+	else if (flags & XFS_ATTR_SECURE)
+		args.flags |= ATTR_SECURE;
+
+	/* Describe the lookup: which inode, which name, where to put it. */
+	args.geo = context->dp->i_mount->m_attr_geo;
+	args.whichfork = XFS_ATTR_FORK;
+	args.dp = context->dp;
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.trans = context->tp;
+	args.value = sx->sc->buf;
+	args.valuelen = XATTR_SIZE_MAX;
+
+	/* -EEXIST from the lookup is not treated as a failure. */
+	error = xfs_attr_get_ilocked(context->dp, &args);
+	if (error == -EEXIST)
+		error = 0;
+	if (!xfs_scrub_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
+			&error))
+		return;
+
+	/* Did we get more or less data than the iterator promised? */
+	if (args.valuelen != valuelen)
+		xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
+				args.blkno);
+}
+
+/*
+ * Mark the range [start, start+len) in this map.  Returns true if the
+ * whole region was previously free, and false if there is a conflict or a
+ * problem: the range starts beyond the map, runs past its end (in which
+ * case only the part inside the map is marked), or overlaps a bit that
+ * was already set.
+ *
+ * The map has one bit per byte of an attr block.  Within a char, the
+ * lowest bit of the char represents the byte with the smallest address.
+ */
+STATIC bool
+xfs_scrub_xattr_set_map(
+	struct xfs_scrub_context	*sc,
+	unsigned long			*map,
+	unsigned int			start,
+	unsigned int			len)
+{
+	unsigned int			mapsize = sc->mp->m_attr_geo->blksize;
+	unsigned int			end = start + len;
+	bool				was_free = true;
+
+	if (start >= mapsize)
+		return false;
+
+	/* Clamp a range that runs off the end of the map. */
+	if (end > mapsize) {
+		end = mapsize;
+		was_free = false;
+	}
+
+	/* Any bit already set inside the range is a conflict. */
+	if (find_next_bit(map, mapsize, start) < end)
+		was_free = false;
+
+	bitmap_set(map, start, end - start);
+	return was_free;
+}
+
+/*
+ * Check the leaf freemap against the usage bitmap.  Returns false if the
+ * attr freemap has problems (a region is out of bounds or overlaps another
+ * freemap region) or points to space that @map records as used.
+ *
+ * Two more bitmaps are carved out of sc->buf for this: the freemap bitmap
+ * at offset BITS_TO_LONGS(blksize) longs, and a scratch bitmap for the
+ * intersection directly after it.
+ */
+STATIC bool
+xfs_scrub_xattr_check_freemap(
+	struct xfs_scrub_context	*sc,
+	unsigned long			*map,
+	struct xfs_attr3_icleaf_hdr	*leafhdr)
+{
+	unsigned int			mapsize = sc->mp->m_attr_geo->blksize;
+	unsigned long			*freemap;
+	unsigned long			*overlap;
+	int				slot;
+
+	/* Construct bitmap of freemap contents. */
+	freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
+	bitmap_zero(freemap, mapsize);
+
+	slot = 0;
+	while (slot < XFS_ATTR_LEAF_MAPSIZE) {
+		if (!xfs_scrub_xattr_set_map(sc, freemap,
+				leafhdr->freemap[slot].base,
+				leafhdr->freemap[slot].size))
+			return false;
+		slot++;
+	}
+
+	/* Look for bits that are set in freemap and are marked in use. */
+	overlap = freemap + BITS_TO_LONGS(mapsize);
+	if (bitmap_and(overlap, freemap, map, mapsize))
+		return false;
+	return true;
+}
+
+/*
+ * Check this leaf entry's relations to everything else.
+ *
+ * Verifies the entry's padding, that hash values do not decrease from one
+ * entry to the next (@last_hashval carries the previous hash and is
+ * updated), that the name index lies between the header's firstused and
+ * the end of the block, that the name/value structure has a non-empty name
+ * (and, for remote values, a non-zero value block) and ends before
+ * @buf_end, and that its bytes were not already claimed in @usedmap.
+ *
+ * Nothing is returned; while the scrub has not been flagged corrupt, the
+ * size of the name/value structure is added to *@usedbytes.
+ */
+STATIC void
+xfs_scrub_xattr_entry(
+	struct xfs_scrub_da_btree		*ds,
+	int					level,
+	char					*buf_end,
+	struct xfs_attr_leafblock		*leaf,
+	struct xfs_attr3_icleaf_hdr		*leafhdr,
+	unsigned long				*usedmap,
+	struct xfs_attr_leaf_entry		*ent,
+	int					idx,
+	unsigned int				*usedbytes,
+	__u32					*last_hashval)
+{
+	struct xfs_mount			*mp = ds->state->mp;
+	struct xfs_attr_leaf_name_local		*lentry;
+	struct xfs_attr_leaf_name_remote	*rentry;
+	char					*name_end;
+	unsigned int				nameidx;
+	unsigned int				namesize;
+	__u32					hash;
+
+	if (ent->pad2 != 0)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	/* Hash values in order? */
+	hash = be32_to_cpu(ent->hashval);
+	if (hash < *last_hashval)
+		xfs_scrub_da_set_corrupt(ds, level);
+	*last_hashval = hash;
+
+	/* A wild name index makes the remaining checks pointless. */
+	nameidx = be16_to_cpu(ent->nameidx);
+	if (nameidx < leafhdr->firstused ||
+	    nameidx >= mp->m_attr_geo->blksize) {
+		xfs_scrub_da_set_corrupt(ds, level);
+		return;
+	}
+
+	/* Check the name information. */
+	if (!(ent->flags & XFS_ATTR_LOCAL)) {
+		rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+		namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+		name_end = (char *)rentry + namesize;
+		if (rentry->namelen == 0 || rentry->valueblk == 0)
+			xfs_scrub_da_set_corrupt(ds, level);
+	} else {
+		lentry = xfs_attr3_leaf_name_local(leaf, idx);
+		namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+				be16_to_cpu(lentry->valuelen));
+		name_end = (char *)lentry + namesize;
+		if (lentry->namelen == 0)
+			xfs_scrub_da_set_corrupt(ds, level);
+	}
+	if (name_end > buf_end)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	/* Claim the name/value bytes; overlap with earlier claims is bad. */
+	if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, nameidx, namesize))
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return;
+	*usedbytes += namesize;
+}
+
+/*
+ * Scrub an attribute leaf.
+ *
+ * Checks the block at @level of the da btree path: header padding, header
+ * fields, every entry together with its name/value structure, the freemap,
+ * and the used byte count.  A bitmap with one bit per byte of the block
+ * (kept at the start of sc->buf) records which bytes have been claimed so
+ * that overlapping structures are detected.
+ *
+ * The block number is remembered through ds->private so that a block is
+ * only scrubbed once while it is visited repeatedly.  Always returns zero;
+ * problems are reported by flagging the da btree corrupt.
+ */
+STATIC int
+xfs_scrub_xattr_block(
+	struct xfs_scrub_da_btree	*ds,
+	int				level)
+{
+	struct xfs_attr3_icleaf_hdr	leafhdr;
+	struct xfs_mount		*mp = ds->state->mp;
+	struct xfs_da_state_blk		*blk = &ds->state->path.blk[level];
+	struct xfs_buf			*bp = blk->bp;
+	xfs_dablk_t			*last_checked = ds->private;
+	struct xfs_attr_leafblock	*leaf = bp->b_addr;
+	struct xfs_attr_leaf_entry	*entries;
+	unsigned long			*usedmap = ds->sc->buf;
+	char				*buf_end;
+	size_t				off;
+	__u32				last_hashval = 0;
+	unsigned int			usedbytes = 0;
+	unsigned int			hdrsize;
+	int				i;
+
+	/* Already scrubbed this block on the previous call? */
+	if (*last_checked == blk->blkno)
+		return 0;
+	*last_checked = blk->blkno;
+	bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+
+	/* Check all the padding. */
+	if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) {
+		struct xfs_attr3_leafblock	*leaf3 = bp->b_addr;
+
+		if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 ||
+		    leaf3->hdr.info.hdr.pad != 0)
+			xfs_scrub_da_set_corrupt(ds, level);
+	} else {
+		if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0)
+			xfs_scrub_da_set_corrupt(ds, level);
+	}
+
+	/* Check the leaf header */
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+	hdrsize = xfs_attr3_leaf_hdr_size(leaf);
+
+	if (leafhdr.usedbytes > mp->m_attr_geo->blksize)
+		xfs_scrub_da_set_corrupt(ds, level);
+	if (leafhdr.firstused > mp->m_attr_geo->blksize)
+		xfs_scrub_da_set_corrupt(ds, level);
+	if (leafhdr.firstused < hdrsize)
+		xfs_scrub_da_set_corrupt(ds, level);
+	/* The header is the first thing to claim bytes in the block. */
+	if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	/* Do not walk the entries of a block with a bad header. */
+	if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/* The entry array must end before the first used name/value byte. */
+	entries = xfs_attr3_leaf_entryp(leaf);
+	if ((char *)&entries[leafhdr.count] > (char *)leaf + leafhdr.firstused)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+	for (i = 0; i < leafhdr.count; i++) {
+		/* Mark the leaf entry itself. */
+		off = (char *)&entries[i] - (char *)leaf;
+		if (!xfs_scrub_xattr_set_map(ds->sc, usedmap, off,
+				sizeof(xfs_attr_leaf_entry_t))) {
+			xfs_scrub_da_set_corrupt(ds, level);
+			return 0;
+		}
+
+		/* Check the entry and nameval. */
+		xfs_scrub_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
+				usedmap, &entries[i], i, &usedbytes,
+				&last_hashval);
+
+		if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+			return 0;
+	}
+
+	/* Free regions must not overlap anything that was claimed above. */
+	if (!xfs_scrub_xattr_check_freemap(ds->sc, usedmap, &leafhdr))
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	/* The header's byte count must match what the entries add up to. */
+	if (leafhdr.usedbytes != usedbytes)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	return 0;
+}
+
+/*
+ * Scrub one attribute leaf entry on behalf of the da btree walker.
+ *
+ * The leaf block holding the entry is handed to xfs_scrub_xattr_block()
+ * first, then the entry's hashval goes to xfs_scrub_da_btree_hash(), and
+ * finally the name offset, flag bits and name hash are checked here.
+ * Errors from the two helpers are returned to the caller; problems with
+ * the entry itself are reported via xfs_scrub_da_set_corrupt() and the
+ * function returns 0.
+ */
+STATIC int
+xfs_scrub_xattr_rec(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	void				*rec)
+{
+	struct xfs_mount		*mp = ds->state->mp;
+	struct xfs_attr_leaf_entry	*ent = rec;
+	struct xfs_da_state_blk		*blk = &ds->state->path.blk[level];
+	struct xfs_attr_leaf_name_local	*local;
+	struct xfs_attr_leaf_name_remote *remote;
+	char				*leaf_base;
+	const __u8			*name;
+	unsigned int			allowed;
+	xfs_dahash_t			want_hash;
+	int				namelen;
+	int				nameidx;
+	int				hdrsize;
+	int				error;
+
+	/* Check the whole block, if necessary; stop if that found trouble. */
+	error = xfs_scrub_xattr_block(ds, level);
+	if (error || (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	/* Check the hash of the entry. */
+	error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval);
+	if (error)
+		return error;
+
+	/* The name must start after the leaf header and inside the block. */
+	leaf_base = blk->bp->b_addr;
+	hdrsize = xfs_attr3_leaf_hdr_size(blk->bp->b_addr);
+	nameidx = be16_to_cpu(ent->nameidx);
+	if (nameidx < hdrsize || nameidx >= mp->m_attr_geo->blksize) {
+		xfs_scrub_da_set_corrupt(ds, level);
+		return 0;
+	}
+
+	/* Any flag bit outside the known set is corruption. */
+	want_hash = be32_to_cpu(ent->hashval);
+	allowed = XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
+		  XFS_ATTR_INCOMPLETE;
+	if (ent->flags & ~allowed)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	/* Locate the name; local and remote entries lay it out differently. */
+	if (ent->flags & XFS_ATTR_LOCAL) {
+		local = (struct xfs_attr_leaf_name_local *)
+				(leaf_base + nameidx);
+		name = local->nameval;
+		namelen = local->namelen;
+	} else {
+		remote = (struct xfs_attr_leaf_name_remote *)
+				(leaf_base + nameidx);
+		name = remote->name;
+		namelen = remote->namelen;
+	}
+
+	/* An empty name cannot be hashed, so flag it and stop. */
+	if (namelen <= 0) {
+		xfs_scrub_da_set_corrupt(ds, level);
+		return 0;
+	}
+
+	/* The stored hash must match the hash of the name. */
+	if (xfs_da_hashname(name, namelen) != want_hash)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	return 0;
+}
+
+/*
+ * Scrub the extended attribute metadata of sc->ip.
+ *
+ * Returns -ENOENT if the inode has no attributes.  Otherwise the attr
+ * fork's da btree is walked with xfs_scrub_xattr_rec() and, provided
+ * that raised neither an error nor the corrupt flag, every attribute
+ * key is fed to xfs_scrub_xattr_listent().
+ */
+int
+xfs_scrub_xattr(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_scrub_xattr		sx;
+	struct attrlist_cursor_kern	cursor = { 0 };
+	xfs_dablk_t			last_checked = -1U;
+	int				error;
+
+	if (!xfs_inode_hasattr(sc->ip))
+		return -ENOENT;
+
+	/* Check attribute tree structure */
+	error = xfs_scrub_da_btree(sc, XFS_ATTR_FORK, xfs_scrub_xattr_rec,
+			&last_checked);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	/* Check that every attr key can also be looked up by hash. */
+	memset(&sx, 0, sizeof(sx));
+	sx.sc = sc;
+	sx.context.dp = sc->ip;
+	sx.context.tp = sc->tp;
+	sx.context.cursor = &cursor;
+	sx.context.resynch = 1;
+	sx.context.flags = ATTR_INCOMPLETE;
+	sx.context.put_listent = xfs_scrub_xattr_listent;
+
+	/*
+	 * Look up every xattr in this file by name.
+	 *
+	 * The backend of xfs_attr_list invokes xfs_scrub_xattr_listent on
+	 * every attribute key in this inode, i.e. we borrow listattr's
+	 * iterator/callback mechanism, except that our _listent function
+	 * checks the value of the attribute.
+	 *
+	 * The VFS only locks i_rwsem when modifying attrs, so keep all
+	 * three locks held because that's the only way to ensure we're
+	 * the only thread poking into the da btree.  We traverse the da
+	 * btree while holding a leaf buffer locked for the xattr name
+	 * iteration, which doesn't really follow the usual buffer
+	 * locking order.
+	 */
+	error = xfs_attr_list_int_ilocked(&sx.context);
+	xfs_scrub_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error);
+	return error;
+}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
new file mode 100644
index 000000000000..42fec0bcd9e1
--- /dev/null
+++ b/fs/xfs/scrub/bmap.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Prepare a scrub context for checking an inode's block mappings.
+ *
+ * Grabs the inode, takes the IOLOCK and MMAPLOCK exclusively, flushes
+ * the data of regular files when the data fork is being scrubbed,
+ * allocates the scrub transaction and finally takes the ILOCK.  The
+ * locks taken are recorded in sc->ilock_flags; scrub teardown unlocks
+ * and releases the inode, so nothing is undone here on failure.
+ */
+int
+xfs_scrub_setup_inode_bmap(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	struct inode			*vfs_inode;
+	int				error;
+
+	error = xfs_scrub_get_inode(sc, ip);
+	if (error)
+		return error;
+
+	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+
+	/*
+	 * We don't want any ephemeral data fork updates sitting around
+	 * while we inspect block mappings, so wait for directio to finish
+	 * and flush dirty data if we have delalloc reservations.
+	 */
+	vfs_inode = VFS_I(sc->ip);
+	if (S_ISREG(vfs_inode->i_mode) &&
+	    sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+		inode_dio_wait(vfs_inode);
+		error = filemap_write_and_wait(vfs_inode->i_mapping);
+		if (error)
+			return error;
+	}
+
+	/* Got the inode; get a transaction, then lock it and we're ready. */
+	error = xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+	if (error)
+		return error;
+
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Inode fork block mapping (BMBT) scrubber.
+ * More complex than the others because we have to scrub
+ * all the extents regardless of whether or not the fork
+ * is in btree format.
+ */
+
+/* State shared by the bmap scrub functions while one fork is checked. */
+struct xfs_scrub_bmap_info {
+ /* scrub context that corruption is reported against */
+ struct xfs_scrub_context *sc;
+ /* file offset just past the previously scrubbed extent */
+ xfs_fileoff_t lastoff;
+ /* data fork of a realtime inode; selects rt block validation */
+ bool is_rt;
+ /* data fork of a reflink inode */
+ bool is_shared;
+ /* fork being scrubbed (XFS_DATA_FORK, XFS_ATTR_FORK or XFS_COW_FORK) */
+ int whichfork;
+};
+
+/*
+ * Scrub a single extent record.
+ *
+ * irec is the decoded mapping; cur is the bmbt cursor the record came
+ * from, or NULL when it came from the incore extent list.  Each failed
+ * check is reported at the extent's file offset, and info->lastoff is
+ * advanced past the extent for the next ordering check.  Always
+ * returns 0.
+ */
+STATIC int
+xfs_scrub_bmap_extent(
+	struct xfs_inode		*ip,
+	struct xfs_btree_cur		*cur,
+	struct xfs_scrub_bmap_info	*info,
+	struct xfs_bmbt_irec		*irec)
+{
+	struct xfs_scrub_context	*sc = info->sc;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp = NULL;
+	xfs_fileoff_t			off = irec->br_startoff;
+	xfs_fsblock_t			first = irec->br_startblock;
+	xfs_fsblock_t			end = first + irec->br_blockcount;
+	int				whichfork = info->whichfork;
+	bool				bad_range;
+
+	if (cur)
+		xfs_btree_get_block(cur, 0, &bp);
+
+	/*
+	 * Check for out-of-order extents.  This record could have come
+	 * from the incore list, for which there is no ordering check.
+	 */
+	if (off < info->lastoff)
+		xfs_scrub_fblock_set_corrupt(sc, whichfork, off);
+
+	/* There should never be a "hole" extent in either extent list. */
+	if (first == HOLESTARTBLOCK)
+		xfs_scrub_fblock_set_corrupt(sc, whichfork, off);
+
+	/*
+	 * Check for delalloc extents.  We never iterate the ones in the
+	 * in-core extent scan, and we should never see these in the bmbt.
+	 */
+	if (isnullstartblock(first))
+		xfs_scrub_fblock_set_corrupt(sc, whichfork, off);
+
+	/* The extent must not be empty or wrap around the block space. */
+	if (end <= first)
+		xfs_scrub_fblock_set_corrupt(sc, whichfork, off);
+
+	/* First and last block must be valid for the device they live on. */
+	if (info->is_rt)
+		bad_range = !xfs_verify_rtbno(mp, first) ||
+			    !xfs_verify_rtbno(mp, end - 1);
+	else
+		bad_range = !xfs_verify_fsbno(mp, first) ||
+			    !xfs_verify_fsbno(mp, end - 1);
+	if (bad_range)
+		xfs_scrub_fblock_set_corrupt(sc, whichfork, off);
+
+	/* We don't allow unwritten extents on attr forks. */
+	if (whichfork == XFS_ATTR_FORK && irec->br_state == XFS_EXT_UNWRITTEN)
+		xfs_scrub_fblock_set_corrupt(sc, whichfork, off);
+
+	info->lastoff = off + irec->br_blockcount;
+	return 0;
+}
+
+/*
+ * Scrub a bmbt record; this is the per-record callback handed to
+ * xfs_scrub_btree() for inode fork btrees.
+ */
+STATIC int
+xfs_scrub_bmapbt_rec(
+	struct xfs_scrub_btree		*bs,
+	union xfs_btree_rec		*rec)
+{
+	struct xfs_btree_cur		*cur = bs->cur;
+	struct xfs_scrub_bmap_info	*info = bs->private;
+	struct xfs_inode		*ip = cur->bc_private.b.ip;
+	struct xfs_bmbt_irec		irec;
+	struct xfs_btree_block		*block;
+	struct xfs_buf			*bp = NULL;
+	int				lvl;
+
+	/*
+	 * Check the owners of the btree blocks up to the level below
+	 * the root since the verifiers don't do that.  Only do so on
+	 * the first record of a leaf, and only on CRC-enabled filesystems.
+	 */
+	if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+	    cur->bc_ptrs[0] == 1) {
+		for (lvl = 0; lvl < cur->bc_nlevels - 1; lvl++) {
+			block = xfs_btree_get_block(cur, lvl, &bp);
+			if (be64_to_cpu(block->bb_u.l.bb_owner) == ip->i_ino)
+				continue;
+			xfs_scrub_fblock_set_corrupt(bs->sc,
+					info->whichfork, 0);
+		}
+	}
+
+	/* Decode the on-disk record and scrub the resulting mapping. */
+	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+	return xfs_scrub_bmap_extent(ip, cur, info, &irec);
+}
+
+/*
+ * Scan the records of an inode fork's bmap btree.  A cursor is set up
+ * for the fork, xfs_scrub_btree() walks it with xfs_scrub_bmapbt_rec()
+ * as the record callback, and the cursor is torn down again.
+ */
+STATIC int
+xfs_scrub_bmap_btree(
+	struct xfs_scrub_context	*sc,
+	int				whichfork,
+	struct xfs_scrub_bmap_info	*info)
+{
+	struct xfs_owner_info		oinfo;
+	struct xfs_btree_cur		*cur;
+	int				error;
+
+	cur = xfs_bmbt_init_cursor(sc->mp, sc->tp, sc->ip, whichfork);
+	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, whichfork);
+
+	error = xfs_scrub_btree(sc, cur, xfs_scrub_bmapbt_rec, &oinfo, info);
+
+	/* Tell the cursor teardown whether the walk failed. */
+	if (error)
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	else
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return error;
+}
+
+/*
+ * Scrub an inode fork's block mappings.
+ *
+ * First we scan every record in every btree block, if applicable.
+ * Then, unless corruption has already been flagged, we scan the incore
+ * extent cache.
+ *
+ * sc is the scrub context and whichfork selects the fork to check.
+ * Returns the error code left by the helpers called below (0 if none);
+ * problems with the mappings themselves are noted through
+ * xfs_scrub_fblock_set_corrupt() and xfs_scrub_ino_set_corrupt().
+ */
+STATIC int
+xfs_scrub_bmap(
+ struct xfs_scrub_context *sc,
+ int whichfork)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_scrub_bmap_info info = { NULL };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t endoff;
+ struct xfs_iext_cursor icur;
+ bool found;
+ int error = 0;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ /* Only data forks can be realtime or shared. */
+ info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+ info.whichfork = whichfork;
+ info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
+ info.sc = sc;
+
+ /* Fork-specific sanity checks before looking at any mappings. */
+ switch (whichfork) {
+ case XFS_COW_FORK:
+ /* Non-existent CoW forks are ignorable. */
+ if (!ifp)
+ goto out;
+ /* No CoW forks on non-reflink inodes/filesystems. */
+ if (!xfs_is_reflink_inode(ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ goto out;
+ }
+ break;
+ case XFS_ATTR_FORK:
+ /* Nothing to check if there is no attr fork. */
+ if (!ifp)
+ goto out;
+ /* An attr fork needs one of the attr feature bits in the sb. */
+ if (!xfs_sb_version_hasattr(&mp->m_sb) &&
+ !xfs_sb_version_hasattr2(&mp->m_sb))
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ break;
+ default:
+ ASSERT(whichfork == XFS_DATA_FORK);
+ break;
+ }
+
+ /* Check the fork values */
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_UUID:
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ /* No mappings to check. */
+ goto out;
+ case XFS_DINODE_FMT_EXTENTS:
+ /* Extents-format forks must have their extents flagged as loaded. */
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* A CoW fork in btree format is treated as corruption. */
+ if (whichfork == XFS_COW_FORK) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ /* Walk the on-disk btree records first. */
+ error = xfs_scrub_bmap_btree(sc, whichfork, &info);
+ if (error)
+ goto out;
+ break;
+ default:
+ /* Any other fork format is corruption. */
+ xfs_scrub_fblock_set_corrupt(sc, whichfork, 0);
+ goto out;
+ }
+
+ /* Don't bother with the incore list if we already found problems. */
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Now try to scrub the in-memory extent list. */
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ /* The extents are not flagged as loaded yet, so read them in. */
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+ }
+
+ /* Find the offset of the last extent in the mapping. */
+ error = xfs_bmap_last_offset(ip, &endoff, whichfork);
+ if (!xfs_scrub_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+
+ /* Scrub extent records. */
+ info.lastoff = 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ for (found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &irec);
+ found != 0;
+ found = xfs_iext_next_extent(ifp, &icur, &irec)) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+ /* Skip delalloc reservations. */
+ if (isnullstartblock(irec.br_startblock))
+ continue;
+ /* No extent may start at or beyond the last offset found above. */
+ if (irec.br_startoff >= endoff) {
+ xfs_scrub_fblock_set_corrupt(sc, whichfork,
+ irec.br_startoff);
+ goto out;
+ }
+ /* No btree cursor here: the record comes from the incore list. */
+ error = xfs_scrub_bmap_extent(ip, NULL, &info, &irec);
+ if (error)
+ goto out;
+ }
+
+out:
+ return error;
+}
+
+/* Scrub the block mappings of the data fork of sc->ip. */
+int
+xfs_scrub_bmap_data(
+	struct xfs_scrub_context	*sc)
+{
+	int				error;
+
+	error = xfs_scrub_bmap(sc, XFS_DATA_FORK);
+	return error;
+}
+
+/* Scrub the block mappings of the attr fork of sc->ip. */
+int
+xfs_scrub_bmap_attr(
+	struct xfs_scrub_context	*sc)
+{
+	int				error;
+
+	error = xfs_scrub_bmap(sc, XFS_ATTR_FORK);
+	return error;
+}
+
+/*
+ * Scrub the block mappings of the CoW fork of sc->ip.  Inodes that are
+ * not reflink inodes get -ENOENT without being examined.
+ */
+int
+xfs_scrub_bmap_cow(
+	struct xfs_scrub_context	*sc)
+{
+	if (xfs_is_reflink_inode(sc->ip))
+		return xfs_scrub_bmap(sc, XFS_COW_FORK);
+
+	return -ENOENT;
+}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
new file mode 100644
index 000000000000..df0766132ace
--- /dev/null
+++ b/fs/xfs/scrub/btree.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* btree scrubbing */
+
+/*
+ * Check for btree operation errors.  See the section about handling
+ * operational errors in common.c.
+ *
+ * Returns true when *error is zero, so the caller may carry on with
+ * its next check, and false otherwise.  -EFSBADCRC and -EFSCORRUPTED
+ * are converted into the XFS_SCRUB_OFLAG_CORRUPT output flag and
+ * *error is reset to zero; every other code is left in *error.
+ */
+bool
+xfs_scrub_btree_process_error(
+	struct xfs_scrub_context	*sc,
+	struct xfs_btree_cur		*cur,
+	int				level,
+	int				*error)
+{
+	if (!*error)
+		return true;
+
+	if (*error == -EDEADLOCK) {
+		/* Used to restart an op with deadlock avoidance. */
+		trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+		return false;
+	}
+
+	if (*error == -EFSBADCRC || *error == -EFSCORRUPTED) {
+		/* Note the badness but don't abort. */
+		sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+		*error = 0;
+	}
+
+	/* Trace the (possibly just cleared) error code. */
+	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+		trace_xfs_scrub_ifork_btree_op_error(sc, cur, level,
+				*error, __return_address);
+	else
+		trace_xfs_scrub_btree_op_error(sc, cur, level,
+				*error, __return_address);
+	return false;
+}
+
+/*
+ * Record btree block corruption: set XFS_SCRUB_OFLAG_CORRUPT in the
+ * scrub output flags and emit the tracepoint that matches the kind of
+ * btree (rooted in an inode fork or not).
+ */
+void
+xfs_scrub_btree_set_corrupt(
+	struct xfs_scrub_context	*sc,
+	struct xfs_btree_cur		*cur,
+	int				level)
+{
+	bool				in_inode;
+
+	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+	in_inode = (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) != 0;
+	if (in_inode) {
+		trace_xfs_scrub_ifork_btree_error(sc, cur, level,
+				__return_address);
+		return;
+	}
+
+	trace_xfs_scrub_btree_error(sc, cur, level, __return_address);
+}
+
+/*
+ * Make sure this record is in order and doesn't stray outside of the parent
+ * keys.
+ *
+ * The record examined is the one the cursor currently points at in the
+ * leaf (level 0).  It is remembered in bs->lastrec for the next call.
+ */
+STATIC void
+xfs_scrub_btree_rec(
+ struct xfs_scrub_btree *bs)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_rec *rec;
+ union xfs_btree_key key;
+ union xfs_btree_key hkey;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ /* Find the current record in the leaf block. */
+ block = xfs_btree_get_block(cur, 0, &bp);
+ rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+
+ trace_xfs_scrub_btree_rec(bs->sc, cur, 0);
+
+ /* If this isn't the first record, are they in order? */
+ if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 0);
+ bs->firstrec = false;
+ /* Save this record so the next one can be compared against it. */
+ memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
+
+ /* A single-level tree has no parent keys to compare against. */
+ if (cur->bc_nlevels == 1)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ keyblock = xfs_btree_get_block(cur, 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+
+ /* Only overlapping btrees have high keys to check. */
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ cur->bc_ops->init_high_key_from_rec(&hkey, rec);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Make sure this key is in order and doesn't stray outside of the parent
+ * keys.
+ *
+ * The key examined is the one the cursor currently points at in the node
+ * block at the given level.  It is remembered in bs->lastkey[level] for
+ * the next call at that level.
+ */
+STATIC void
+xfs_scrub_btree_key(
+ struct xfs_scrub_btree *bs,
+ int level)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *key;
+ union xfs_btree_key *keyp;
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *keyblock;
+ struct xfs_buf *bp;
+
+ /* Find the current key in this level's block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
+
+ trace_xfs_scrub_btree_key(bs->sc, cur, level);
+
+ /* If this isn't the first key, are they in order? */
+ if (!bs->firstkey[level] &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ bs->firstkey[level] = false;
+ /* Save this key so the next one at this level can be compared. */
+ memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len);
+
+ /* The topmost level has no parent keys to compare against. */
+ if (level + 1 >= cur->bc_nlevels)
+ return;
+
+ /* Is this at least as large as the parent low key? */
+ keyblock = xfs_btree_get_block(cur, level + 1, &bp);
+ keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+
+ /* Only overlapping btrees have high keys to check. */
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Is this no larger than the parent high key? */
+ key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
+ keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock);
+ if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+}
+
+/*
+ * Check a btree pointer.  Returns true if it's ok to use this pointer.
+ * A bad pointer is flagged as corruption here, so callers do not need
+ * to set the corrupt flag themselves.
+ */
+static bool
+xfs_scrub_btree_ptr_ok(
+	struct xfs_scrub_btree		*bs,
+	int				level,
+	union xfs_btree_ptr		*ptr)
+{
+	struct xfs_btree_cur		*cur = bs->cur;
+	bool				ok;
+
+	/* A btree rooted in an inode has no block pointer to the root. */
+	if (level == cur->bc_nlevels &&
+	    (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+		return true;
+
+	/* Otherwise validate it with the checker for its pointer width. */
+	ok = (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+		xfs_btree_check_lptr(cur, be64_to_cpu(ptr->l), level) :
+		xfs_btree_check_sptr(cur, be32_to_cpu(ptr->s), level);
+	if (ok)
+		return true;
+
+	xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+	return false;
+}
+
+/*
+ * Check that a btree block's sibling pointer matches what we expect it
+ * to be.
+ *
+ * A duplicate of the cursor is stepped one slot at level + 1: forwards
+ * when direction > 0, backwards otherwise.  If the sibling pointer is
+ * null, that step must not succeed; otherwise the pointer found in the
+ * parent after the step must equal the sibling pointer.  Mismatches are
+ * flagged as corruption.
+ */
+STATIC int
+xfs_scrub_btree_block_check_sibling(
+ struct xfs_scrub_btree *bs,
+ int level,
+ int direction,
+ union xfs_btree_ptr *sibling)
+{
+ struct xfs_btree_cur *cur = bs->cur;
+ struct xfs_btree_block *pblock;
+ struct xfs_buf *pbp;
+ struct xfs_btree_cur *ncur = NULL;
+ union xfs_btree_ptr *pp;
+ int success;
+ int error;
+
+ /* Work on a duplicate so the caller's cursor position is untouched. */
+ error = xfs_btree_dup_cursor(cur, &ncur);
+ if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error) ||
+ !ncur)
+ return error;
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (xfs_btree_ptr_is_null(cur, sibling)) {
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (error == 0 && success)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+ /* Any error from the probe above is deliberately discarded. */
+ error = 0;
+ goto out;
+ }
+
+ /* Increment upper level pointer. */
+ if (direction > 0)
+ error = xfs_btree_increment(ncur, level + 1, &success);
+ else
+ error = xfs_btree_decrement(ncur, level + 1, &success);
+ if (!xfs_scrub_btree_process_error(bs->sc, cur, level + 1, &error))
+ goto out;
+ /* A non-null sibling pointer with nowhere to move to is corruption. */
+ if (!success) {
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level + 1);
+ goto out;
+ }
+
+ /* Compare upper level pointer to sibling pointer. */
+ pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
+ pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock);
+ if (!xfs_scrub_btree_ptr_ok(bs, level + 1, pp))
+ goto out;
+
+ if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+out:
+ /* The duplicate cursor is always deleted with XFS_BTREE_ERROR. */
+ xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Check the sibling pointers of a btree block.  The root block must
+ * have none; for any other block each sibling pointer is compared with
+ * the neighbouring pointer in the parent.
+ */
+STATIC int
+xfs_scrub_btree_block_check_siblings(
+	struct xfs_scrub_btree		*bs,
+	struct xfs_btree_block		*block)
+{
+	struct xfs_btree_cur		*cur = bs->cur;
+	union xfs_btree_ptr		left;
+	union xfs_btree_ptr		right;
+	int				level;
+	int				error;
+
+	xfs_btree_get_sibling(cur, block, &left, XFS_BB_LEFTSIB);
+	xfs_btree_get_sibling(cur, block, &right, XFS_BB_RIGHTSIB);
+	level = xfs_btree_get_level(block);
+
+	/* Root block should never have siblings. */
+	if (level == cur->bc_nlevels - 1) {
+		if (!(xfs_btree_ptr_is_null(cur, &left) &&
+		      xfs_btree_ptr_is_null(cur, &right)))
+			xfs_scrub_btree_set_corrupt(bs->sc, cur, level);
+		return 0;
+	}
+
+	/*
+	 * Do the left and right sibling pointers match the adjacent
+	 * parent level pointers?  Left is checked first; any error it
+	 * returns ends the check.
+	 */
+	error = xfs_scrub_btree_block_check_sibling(bs, level, -1, &left);
+	if (error)
+		return error;
+
+	return xfs_scrub_btree_block_check_sibling(bs, level, 1, &right);
+}
+
+/*
+ * Grab and scrub a btree block given a btree pointer. Returns block
+ * and buffer pointers (if applicable) if they're ok to use.
+ *
+ * *pblock and *pbp are set to NULL on entry, so callers can tell from a
+ * NULL *pblock that no block was obtained.  A block that fails the
+ * lblock/sblock check is flagged as corruption and 0 is returned; note
+ * that *pblock still points at the block in that case.
+ */
+STATIC int
+xfs_scrub_btree_get_block(
+ struct xfs_scrub_btree *bs,
+ int level,
+ union xfs_btree_ptr *pp,
+ struct xfs_btree_block **pblock,
+ struct xfs_buf **pbp)
+{
+ void *failed_at;
+ int error;
+
+ *pblock = NULL;
+ *pbp = NULL;
+
+ /* Look the block up through the cursor at this level. */
+ error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, level, &error) ||
+ !*pblock)
+ return error;
+
+ /* Fetch the buffer for this level, then check the block header. */
+ xfs_btree_get_block(bs->cur, level, pbp);
+ if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
+ level, *pbp);
+ else
+ failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
+ level, *pbp);
+ /* A non-NULL result from the checker means the block is bad. */
+ if (failed_at) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+ return 0;
+ }
+
+ /*
+ * Check the block's siblings; this function absorbs error codes
+ * for us.
+ */
+ return xfs_scrub_btree_block_check_siblings(bs, *pblock);
+}
+
+/*
+ * Check that the low and high keys of this block match the keys stored
+ * in the parent block.
+ *
+ * level is the level of the given block; the parent's copy of the keys
+ * is read from the cursor's current slot at level + 1.  Nothing is done
+ * for the topmost level, which has no parent.
+ *
+ * NOTE(review): both corruption reports below pass a hard-coded level
+ * of 1 rather than a value derived from 'level' -- confirm whether that
+ * is intended for blocks above the leaf level.
+ */
+STATIC void
+xfs_scrub_btree_block_keys(
+ struct xfs_scrub_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ union xfs_btree_key block_keys;
+ struct xfs_btree_cur *cur = bs->cur;
+ union xfs_btree_key *high_bk;
+ union xfs_btree_key *parent_keys;
+ union xfs_btree_key *high_pk;
+ struct xfs_btree_block *parent_block;
+ struct xfs_buf *bp;
+
+ /* The topmost block has no parent to compare against. */
+ if (level >= cur->bc_nlevels - 1)
+ return;
+
+ /* Calculate the keys for this block. */
+ xfs_btree_get_keys(cur, block, &block_keys);
+
+ /* Obtain the parent's copy of the keys for this block. */
+ parent_block = xfs_btree_get_block(cur, level + 1, &bp);
+ parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_block);
+
+ /* The low keys must be exactly equal. */
+ if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+
+ /* Only overlapping btrees have high keys to check. */
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Get high keys */
+ high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
+ high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1],
+ parent_block);
+
+ /* The high keys must be exactly equal as well. */
+ if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, cur, 1);
+}
+
+/*
+ * Visit all nodes and leaves of a btree. Check that all pointers and
+ * records are in order, that the keys reflect the records, and use a callback
+ * so that the caller can verify individual records.
+ *
+ * sc is the scrub context, cur the cursor of the btree to walk, scrub_fn
+ * the per-record callback, and oinfo/private are stored in the scrub
+ * state handed to that callback.  The walk stops early on an error, when
+ * xfs_scrub_should_terminate() says so, or once the corrupt flag is set
+ * after a record has been checked.  Returns 0 or the error that stopped
+ * the walk.
+ */
+int
+xfs_scrub_btree(
+ struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur,
+ xfs_scrub_btree_rec_fn scrub_fn,
+ struct xfs_owner_info *oinfo,
+ void *private)
+{
+ struct xfs_scrub_btree bs = { NULL };
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int level;
+ struct xfs_buf *bp;
+ int i;
+ int error = 0;
+
+ /* Initialize scrub state */
+ bs.cur = cur;
+ bs.scrub_rec = scrub_fn;
+ bs.oinfo = oinfo;
+ bs.firstrec = true;
+ bs.private = private;
+ bs.sc = sc;
+ /* No key has been seen yet at any level. */
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
+ bs.firstkey[i] = true;
+ INIT_LIST_HEAD(&bs.to_check);
+
+ /* Don't try to check a tree with a height we can't handle. */
+ if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) {
+ xfs_scrub_btree_set_corrupt(sc, cur, 0);
+ goto out;
+ }
+
+ /*
+ * Load the root of the btree. The helper function absorbs
+ * error codes for us.
+ */
+ level = cur->bc_nlevels - 1;
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ /* Passing bc_nlevels lets the inode-rooted case skip the pointer check. */
+ if (!xfs_scrub_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr))
+ goto out;
+ error = xfs_scrub_btree_get_block(&bs, level, &ptr, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ /* Start at the first slot of the root block. */
+ cur->bc_ptrs[level] = 1;
+
+ /* The walk is over once we pop back up past the root level. */
+ while (level < cur->bc_nlevels) {
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ if (level == 0) {
+ /* End of leaf, pop back towards the root. */
+ if (cur->bc_ptrs[level] >
+ be16_to_cpu(block->bb_numrecs)) {
+ xfs_scrub_btree_block_keys(&bs, level, block);
+ /* Advance the parent to its next slot, if there is a parent. */
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ /* Records in order for scrub? */
+ xfs_scrub_btree_rec(&bs);
+
+ /* Call out to the record checker. */
+ recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
+ error = bs.scrub_rec(&bs, recp);
+ if (error)
+ break;
+ /* Stop on a termination request or once corruption is flagged. */
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ /* Move on to the next record in this leaf. */
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+
+ /* End of node, pop back towards the root. */
+ if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
+ xfs_scrub_btree_block_keys(&bs, level, block);
+ if (level < cur->bc_nlevels - 1)
+ cur->bc_ptrs[level + 1]++;
+ level++;
+ continue;
+ }
+
+ /* Keys in order for scrub? */
+ xfs_scrub_btree_key(&bs, level);
+
+ /* Drill another level deeper. */
+ pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
+ /* Skip over a bad child pointer instead of following it. */
+ if (!xfs_scrub_btree_ptr_ok(&bs, level, pp)) {
+ cur->bc_ptrs[level]++;
+ continue;
+ }
+ level--;
+ error = xfs_scrub_btree_get_block(&bs, level, pp, &block, &bp);
+ if (error || !block)
+ goto out;
+
+ /* Start at the first slot of the child block. */
+ cur->bc_ptrs[level] = 1;
+ }
+
+out:
+ return error;
+}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
new file mode 100644
index 000000000000..4de825a626d1
--- /dev/null
+++ b/fs/xfs/scrub/btree.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_BTREE_H__
+#define __XFS_SCRUB_BTREE_H__
+
+/* btree scrub */
+
+/* Check for btree operation errors. */
+bool xfs_scrub_btree_process_error(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level, int *error);
+
+/* Check for btree corruption. */
+void xfs_scrub_btree_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_btree_cur *cur, int level);
+
+struct xfs_scrub_btree;
+/*
+ * Per-record callback invoked by xfs_scrub_btree() for each leaf record;
+ * a nonzero return value stops the walk and is returned to the caller.
+ */
+typedef int (*xfs_scrub_btree_rec_fn)(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec);
+
+/* State of one btree scrub walk. */
+struct xfs_scrub_btree {
+ /* caller-provided scrub state */
+ struct xfs_scrub_context *sc;
+ struct xfs_btree_cur *cur;
+ xfs_scrub_btree_rec_fn scrub_rec;
+ struct xfs_owner_info *oinfo;
+ void *private;
+
+ /* internal scrub state */
+ /* previous record seen, valid once firstrec is false */
+ union xfs_btree_rec lastrec;
+ bool firstrec;
+ /* previous key seen at each level, valid once firstkey[] is false */
+ union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
+ bool firstkey[XFS_BTREE_MAXLEVELS];
+ struct list_head to_check;
+};
+/* Walk every block of the btree behind cur, calling scrub_fn per record. */
+int xfs_scrub_btree(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ xfs_scrub_btree_rec_fn scrub_fn,
+ struct xfs_owner_info *oinfo, void *private);
+
+#endif /* __XFS_SCRUB_BTREE_H__ */
+
+#endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
new file mode 100644
index 000000000000..ac95fe911d96
--- /dev/null
+++ b/fs/xfs/scrub/common.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/btree.h"
+
+/* Common code for the metadata scrubbers. */
+
+/*
+ * Handling operational errors.
+ *
+ * The *_process_error() family of functions is used to process error
+ * return codes from functions called as part of a scrub operation.
+ *
+ * If there's no error, we return true to tell the caller that it's ok
+ * to move on to the next check in its list.
+ *
+ * For non-verifier errors (e.g. ENOMEM) we return false to tell the
+ * caller that something bad happened, and we preserve *error so that
+ * the caller can return the *error up the stack to userspace.
+ *
+ * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
+ * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words,
+ * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
+ * not via return codes. We return false to tell the caller that
+ * something bad happened. Since the error has been cleared, the caller
+ * will (presumably) return that zero and scrubbing will move on to
+ * whatever's next.
+ *
+ * ftrace can be used to record the precise metadata location and the
+ * approximate code location of the failed operation.
+ */
+
+/*
+ * Check the outcome of an operation on AG metadata.
+ *
+ * Returns true only when *error is zero. -EDEADLOCK is traced and left
+ * in *error. Verifier failures (-EFSBADCRC/-EFSCORRUPTED) set
+ * OFLAG_CORRUPT and clear *error. Any other code is traced and left in
+ * *error for the caller to return.
+ */
+bool
+xfs_scrub_process_error(
+	struct xfs_scrub_context	*sc,
+	xfs_agnumber_t			agno,
+	xfs_agblock_t			bno,
+	int				*error)
+{
+	if (*error == 0)
+		return true;
+
+	if (*error == -EDEADLOCK) {
+		/* Used to restart an op with deadlock avoidance. */
+		trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+		return false;
+	}
+
+	if (*error == -EFSBADCRC || *error == -EFSCORRUPTED) {
+		/* Note the badness but don't abort. */
+		sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+		*error = 0;
+	}
+
+	/* Verifier errors are traced with the already-cleared code. */
+	trace_xfs_scrub_op_error(sc, agno, bno, *error, __return_address);
+	return false;
+}
+
+/*
+ * Check the outcome of an operation on a file fork offset.
+ *
+ * Same contract as xfs_scrub_process_error(), except that the failure
+ * is traced against (whichfork, offset).
+ */
+bool
+xfs_scrub_fblock_process_error(
+	struct xfs_scrub_context	*sc,
+	int				whichfork,
+	xfs_fileoff_t			offset,
+	int				*error)
+{
+	const int			code = *error;
+
+	if (!code)
+		return true;
+
+	if (code == -EDEADLOCK) {
+		/* Used to restart an op with deadlock avoidance. */
+		trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+	} else {
+		if (code == -EFSBADCRC || code == -EFSCORRUPTED) {
+			/* Note the badness but don't abort. */
+			sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+			*error = 0;
+		}
+		trace_xfs_scrub_file_op_error(sc, whichfork, offset, *error,
+				__return_address);
+	}
+	return false;
+}
+
+/*
+ * Handling scrub corruption/optimization/warning checks.
+ *
+ * The *_set_{corrupt,preen,warning}() family of functions is used to
+ * record the presence of metadata that is incorrect (corrupt), could be
+ * optimized somehow (preen), or should be flagged for administrative
+ * review but is not incorrect (warning).
+ *
+ * ftrace can be used to record the precise metadata location and
+ * approximate code location of the failed check.
+ */
+
+/* Set OFLAG_PREEN and trace the block (bp->b_bn) that could be optimized. */
+void
+xfs_scrub_block_set_preen(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+	trace_xfs_scrub_block_preen(sc, bp->b_bn, __return_address);
+}
+
+/*
+ * Set OFLAG_PREEN for an inode which could be optimized. The
+ * tracepoint is given bp's block number when @bp is non-NULL and 0
+ * otherwise.
+ */
+void
+xfs_scrub_ino_set_preen(
+	struct xfs_scrub_context	*sc,
+	xfs_ino_t			ino,
+	struct xfs_buf			*bp)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
+	trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0, __return_address);
+}
+
+/* Set OFLAG_CORRUPT and trace the corrupt block (bp->b_bn). */
+void
+xfs_scrub_block_set_corrupt(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+	trace_xfs_scrub_block_error(sc, bp->b_bn, __return_address);
+}
+
+/*
+ * Set OFLAG_CORRUPT for a corrupt inode. The tracepoint is given bp's
+ * block number when @bp is non-NULL and 0 otherwise.
+ */
+void
+xfs_scrub_ino_set_corrupt(
+	struct xfs_scrub_context	*sc,
+	xfs_ino_t			ino,
+	struct xfs_buf			*bp)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+	trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0,
+			__return_address);
+}
+
+/* Set OFLAG_CORRUPT for the block at @offset in the @whichfork fork. */
+void
+xfs_scrub_fblock_set_corrupt(
+	struct xfs_scrub_context	*sc,
+	int				whichfork,
+	xfs_fileoff_t			offset)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+	trace_xfs_scrub_fblock_error(sc, whichfork, offset, __return_address);
+}
+
+/*
+ * Set OFLAG_WARNING for an inode that needs administrative review but
+ * is not incorrect. The tracepoint is given bp's block number when
+ * @bp is non-NULL and 0 otherwise.
+ */
+void
+xfs_scrub_ino_set_warning(
+	struct xfs_scrub_context	*sc,
+	xfs_ino_t			ino,
+	struct xfs_buf			*bp)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+	trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0,
+			__return_address);
+}
+
+/* Set OFLAG_WARNING for the block at @offset in the @whichfork fork. */
+void
+xfs_scrub_fblock_set_warning(
+	struct xfs_scrub_context	*sc,
+	int				whichfork,
+	xfs_fileoff_t			offset)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
+	trace_xfs_scrub_fblock_warning(sc, whichfork, offset, __return_address);
+}
+
+/* Set OFLAG_INCOMPLETE to signal that the scrub did not finish. */
+void
+xfs_scrub_set_incomplete(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+
+	sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
+	trace_xfs_scrub_incomplete(sc, __return_address);
+}
+
+/*
+ * AG scrubbing
+ *
+ * These helpers facilitate locking an allocation group's header
+ * buffers, setting up cursors for all btrees that are present, and
+ * cleaning everything up once we're through.
+ */
+
+/*
+ * Decide if we want to return an AG header read failure.
+ *
+ * @type is the scrub type of the header whose read just failed.
+ */
+static inline bool
+want_ag_read_header_failure(
+	struct xfs_scrub_context	*sc,
+	unsigned int			type)
+{
+	switch (sc->sm->sm_type) {
+	case XFS_SCRUB_TYPE_AGF:
+	case XFS_SCRUB_TYPE_AGFL:
+	case XFS_SCRUB_TYPE_AGI:
+		/*
+		 * If we're scanning a given type of AG header, we only
+		 * want to see read failures from that specific header.
+		 * We'd like the other headers to cross-check them, but
+		 * this isn't required.
+		 */
+		return sc->sm->sm_type == type;
+	default:
+		/* Return all AG header read failures when scanning btrees. */
+		return true;
+	}
+}
+
+/*
+ * Grab all the headers for an AG.
+ *
+ * The AGI, AGF and AGFL are read in that order. A read failure is
+ * returned only when want_ag_read_header_failure() says the current
+ * scrub type cares about that header; otherwise the failure is ignored
+ * and we move on to the next header.
+ *
+ * Returns 0 if every failure (if any) was one we chose to ignore, or
+ * the error code of the first header read failure that matters.
+ *
+ * The headers should be released by xfs_scrub_ag_free, but as a fail
+ * safe we attach all the buffers we grab to the scrub transaction so
+ * they'll all be freed when we cancel it.
+ */
+int
+xfs_scrub_ag_read_headers(
+	struct xfs_scrub_context	*sc,
+	xfs_agnumber_t			agno,
+	struct xfs_buf			**agi,
+	struct xfs_buf			**agf,
+	struct xfs_buf			**agfl)
+{
+	struct xfs_mount		*mp = sc->mp;
+	int				error;
+
+	error = xfs_ialloc_read_agi(mp, sc->tp, agno, agi);
+	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+		return error;
+
+	error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, agf);
+	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+		return error;
+
+	error = xfs_alloc_read_agfl(mp, sc->tp, agno, agfl);
+	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+		return error;
+
+	/*
+	 * Anything still sitting in error is an AGFL read failure that
+	 * we decided to ignore, so don't leak it to the caller.
+	 */
+	return 0;
+}
+
+/*
+ * Release all the AG btree cursors and reset the pointers to NULL.
+ * Cursors are deleted in the order refcount, rmap, finobt, inobt,
+ * cntbt, bnobt.
+ */
+void
+xfs_scrub_ag_btcur_free(
+	struct xfs_scrub_ag	*sa)
+{
+	if (sa->refc_cur) {
+		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
+		sa->refc_cur = NULL;
+	}
+	if (sa->rmap_cur) {
+		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
+		sa->rmap_cur = NULL;
+	}
+	if (sa->fino_cur) {
+		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
+		sa->fino_cur = NULL;
+	}
+	if (sa->ino_cur) {
+		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
+		sa->ino_cur = NULL;
+	}
+	if (sa->cnt_cur) {
+		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
+		sa->cnt_cur = NULL;
+	}
+	if (sa->bno_cur) {
+		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
+		sa->bno_cur = NULL;
+	}
+}
+
+/*
+ * Initialize all the btree cursors for an AG.
+ *
+ * Cursors are only created for btrees whose header buffer (sa->agf_bp
+ * or sa->agi_bp) is present and, where applicable, whose feature bit
+ * is set in the superblock. Returns 0 on success or -ENOMEM as soon
+ * as a cursor cannot be set up; cursors created before the failure
+ * are left in @sa.
+ */
+int
+xfs_scrub_ag_btcur_init(
+	struct xfs_scrub_context	*sc,
+	struct xfs_scrub_ag		*sa)
+{
+	struct xfs_mount		*mp = sc->mp;
+	xfs_agnumber_t			agno = sa->agno;
+
+	if (sa->agf_bp) {
+		/* Set up a bnobt cursor for cross-referencing. */
+		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+				agno, XFS_BTNUM_BNO);
+		if (!sa->bno_cur)
+			return -ENOMEM;
+
+		/* Set up a cntbt cursor for cross-referencing. */
+		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+				agno, XFS_BTNUM_CNT);
+		if (!sa->cnt_cur)
+			return -ENOMEM;
+	}
+
+	if (sa->agi_bp) {
+		/* Set up a inobt cursor for cross-referencing. */
+		sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+				agno, XFS_BTNUM_INO);
+		if (!sa->ino_cur)
+			return -ENOMEM;
+
+		/* Set up a finobt cursor for cross-referencing. */
+		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+			sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp,
+					sa->agi_bp, agno, XFS_BTNUM_FINO);
+			if (!sa->fino_cur)
+				return -ENOMEM;
+		}
+	}
+
+	if (sa->agf_bp) {
+		/* Set up a rmapbt cursor for cross-referencing. */
+		if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+					sa->agf_bp, agno);
+			if (!sa->rmap_cur)
+				return -ENOMEM;
+		}
+
+		/* Set up a refcountbt cursor for cross-referencing. */
+		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+					sa->agf_bp, agno, NULL);
+			if (!sa->refc_cur)
+				return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release the AG header context and btree cursors.
+ *
+ * The cursors go first, then the AGFL, AGF and AGI buffers are
+ * released from the scrub transaction, and finally sa->agno is reset
+ * to NULLAGNUMBER.
+ */
+void
+xfs_scrub_ag_free(
+	struct xfs_scrub_context	*sc,
+	struct xfs_scrub_ag		*sa)
+{
+	struct xfs_trans		*tp = sc->tp;
+
+	xfs_scrub_ag_btcur_free(sa);
+
+	if (sa->agfl_bp)
+		xfs_trans_brelse(tp, sa->agfl_bp);
+	sa->agfl_bp = NULL;
+
+	if (sa->agf_bp)
+		xfs_trans_brelse(tp, sa->agf_bp);
+	sa->agf_bp = NULL;
+
+	if (sa->agi_bp)
+		xfs_trans_brelse(tp, sa->agi_bp);
+	sa->agi_bp = NULL;
+
+	sa->agno = NULLAGNUMBER;
+}
+
+/*
+ * For scrub, grab the AG headers and set up the btree cursors.
+ * Locking order requires us to get the AGI before the AGF. We use
+ * the transaction to avoid deadlocking on crosslinked metadata
+ * buffers; either the caller passes one in (bmap scrub) or we have to
+ * create a transaction ourselves.
+ *
+ * Records @agno in @sa, reads the headers, and on success initializes
+ * the cursors. Returns the first error encountered, or 0.
+ */
+int
+xfs_scrub_ag_init(
+	struct xfs_scrub_context	*sc,
+	xfs_agnumber_t			agno,
+	struct xfs_scrub_ag		*sa)
+{
+	int				error;
+
+	sa->agno = agno;
+	error = xfs_scrub_ag_read_headers(sc, agno, &sa->agi_bp,
+			&sa->agf_bp, &sa->agfl_bp);
+	if (!error)
+		error = xfs_scrub_ag_btcur_init(sc, sa);
+	return error;
+}
+
+/* Per-scrubber setup functions */
+
+/*
+ * Set us up with a transaction and an empty context. @ip is not used
+ * here. Returns the result of the transaction allocation.
+ */
+int
+xfs_scrub_setup_fs(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	int				error;
+
+	error = xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+	return error;
+}
+
+/*
+ * Set us up with AG headers and btree cursors.
+ *
+ * If the caller asks us to checkpoint the log, do so first. This
+ * expensive operation should be performed infrequently and only as a
+ * last resort. Any caller that sets force_log should document why
+ * they need to do so.
+ */
+int
+xfs_scrub_setup_ag_btree(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip,
+	bool				force_log)
+{
+	int				error = 0;
+
+	if (force_log)
+		error = xfs_scrub_checkpoint_log(sc->mp);
+	if (!error)
+		error = xfs_scrub_setup_ag_header(sc, ip);
+	if (error)
+		return error;
+
+	return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa);
+}
+
+/*
+ * Push everything out of the log onto disk: force the log
+ * synchronously and, if that succeeds, push the whole AIL. Returns
+ * the log force error, or 0.
+ */
+int
+xfs_scrub_checkpoint_log(
+	struct xfs_mount	*mp)
+{
+	int			error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+
+	if (!error)
+		xfs_ail_push_all_sync(mp->m_ail);
+	return error;
+}
+
+/*
+ * Given an inode and the scrub control structure, grab either the
+ * inode referenced in the control structure or the inode passed in,
+ * and store it in sc->ip. The inode is not locked.
+ *
+ * Returns -EINVAL for nonsensical input, -ENOENT if the requested
+ * inode is internal, does not exist, or has a different generation,
+ * any other xfs_iget error as-is, and 0 on success.
+ */
+int
+xfs_scrub_get_inode(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip_in)
+{
+	struct xfs_scrub_metadata	*sm = sc->sm;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_inode		*ip = NULL;
+	int				error;
+
+	/*
+	 * If userspace passed us an AG number or a generation number
+	 * without an inode number, they haven't got a clue so bail out
+	 * immediately.
+	 */
+	if (sm->sm_agno)
+		return -EINVAL;
+	if (sm->sm_gen && !sm->sm_ino)
+		return -EINVAL;
+
+	/* We want to scan the inode we already had opened. */
+	if (sm->sm_ino == 0 || sm->sm_ino == ip_in->i_ino) {
+		sc->ip = ip_in;
+		return 0;
+	}
+
+	/* Look up the inode, see if the generation number matches. */
+	if (xfs_internal_inum(mp, sm->sm_ino))
+		return -ENOENT;
+
+	error = xfs_iget(mp, NULL, sm->sm_ino,
+			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
+	switch (error) {
+	case 0:
+		break;
+	case -ENOENT:
+	case -EINVAL:
+		/* inode doesn't exist... */
+		return -ENOENT;
+	default:
+		trace_xfs_scrub_op_error(sc,
+				XFS_INO_TO_AGNO(mp, sm->sm_ino),
+				XFS_INO_TO_AGBNO(mp, sm->sm_ino),
+				error, __return_address);
+		return error;
+	}
+
+	if (VFS_I(ip)->i_generation != sm->sm_gen) {
+		/* Wrong generation; drop the reference we just took. */
+		iput(VFS_I(ip));
+		return -ENOENT;
+	}
+
+	sc->ip = ip;
+	return 0;
+}
+
+/*
+ * Set us up to scrub a file's contents.
+ *
+ * Grabs the inode, takes the IOLOCK and MMAPLOCK exclusively,
+ * allocates the scrub transaction, and then takes the ILOCK
+ * exclusively. Every lock taken is recorded in sc->ilock_flags.
+ * @resblks is not used here.
+ *
+ * On failure after the inode has been grabbed, scrub teardown will
+ * unlock and release the inode for us.
+ */
+int
+xfs_scrub_setup_inode_contents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip,
+	unsigned int			resblks)
+{
+	struct xfs_mount		*mp = sc->mp;
+	int				error;
+
+	error = xfs_scrub_get_inode(sc, ip);
+	if (error)
+		return error;
+
+	/* Got the inode, lock it and we're ready to go. */
+	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+
+	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	if (error)
+		return error;
+
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+	return 0;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
new file mode 100644
index 000000000000..5c043855570e
--- /dev/null
+++ b/fs/xfs/scrub/common.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_COMMON_H__
+#define __XFS_SCRUB_COMMON_H__
+
+/*
+ * We /could/ terminate a scrub/repair operation early. If we're not
+ * in a good place to continue (a fatal signal is pending) then tell
+ * the caller to bail out by returning true.
+ *
+ * An error already stored in *error is preserved; only a zero *error
+ * is replaced with -EAGAIN.
+ */
+static inline bool
+xfs_scrub_should_terminate(
+	struct xfs_scrub_context	*sc,
+	int				*error)
+{
+	if (!fatal_signal_pending(current))
+		return false;
+
+	if (*error == 0)
+		*error = -EAGAIN;
+	return true;
+}
+
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic. @sm is not used here.
+ */
+static inline int
+xfs_scrub_trans_alloc(
+	struct xfs_scrub_metadata	*sm,
+	struct xfs_mount		*mp,
+	struct xfs_trans		**tpp)
+{
+	int				error;
+
+	error = xfs_trans_alloc_empty(mp, tpp);
+	return error;
+}
+
+/*
+ * Operational error handling: these return true when *error is zero
+ * and false otherwise (see common.c).
+ */
+bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int *error);
+bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, int *error);
+
+/* Record metadata that could be optimized (OFLAG_PREEN). */
+void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino,
+ struct xfs_buf *bp);
+
+/* Record corrupt metadata (OFLAG_CORRUPT). */
+void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc,
+ struct xfs_buf *bp);
+void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino,
+ struct xfs_buf *bp);
+void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+/* Record metadata that needs administrative review (OFLAG_WARNING). */
+void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino,
+ struct xfs_buf *bp);
+void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset);
+
+/* Signal an incomplete scrub (OFLAG_INCOMPLETE). */
+void xfs_scrub_set_incomplete(struct xfs_scrub_context *sc);
+/* Push everything out of the log onto disk. */
+int xfs_scrub_checkpoint_log(struct xfs_mount *mp);
+
+/*
+ * Per-scrubber setup functions. They all share the same (sc, ip)
+ * signature and return an int.
+ */
+int xfs_scrub_setup_fs(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+int xfs_scrub_setup_ag_header(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_allocbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_iallocbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_rmapbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_ag_refcountbt(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode_bmap(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_inode_bmap_data(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_directory(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_xattr(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_symlink(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+int xfs_scrub_setup_parent(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip);
+/*
+ * Realtime scrub setup. Without CONFIG_XFS_RT the setup is a stub
+ * that always returns -ENOENT.
+ */
+#ifdef CONFIG_XFS_RT
+int xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+#else
+static inline int
+xfs_scrub_setup_rt(struct xfs_scrub_context *sc, struct xfs_inode *ip)
+{
+ return -ENOENT;
+}
+#endif
+/*
+ * Quota scrub setup. Without CONFIG_XFS_QUOTA the setup is a stub
+ * that always returns -ENOENT.
+ */
+#ifdef CONFIG_XFS_QUOTA
+int xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+#else
+static inline int
+xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
+{
+ return -ENOENT;
+}
+#endif
+
+/* AG header and btree cursor helpers (see common.c). */
+void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ struct xfs_buf **agi, struct xfs_buf **agf,
+ struct xfs_buf **agfl);
+void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa);
+int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
+ struct xfs_scrub_ag *sa);
+/*
+ * Takes a callback @fn, which receives the scrub context, an AG block
+ * number and @priv.
+ */
+int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
+ int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno,
+ void *),
+ void *priv);
+
+/* Shared helpers for the per-scrubber setup functions (see common.c). */
+int xfs_scrub_setup_ag_btree(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip, bool force_log);
+int xfs_scrub_get_inode(struct xfs_scrub_context *sc, struct xfs_inode *ip_in);
+int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc,
+ struct xfs_inode *ip, unsigned int resblks);
+
+#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
new file mode 100644
index 000000000000..d94edd93cba8
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/* Directory/Attribute Btree */
+
+/*
+ * Check for da btree operation errors. See the section about handling
+ * operational errors in common.c.
+ *
+ * Returns true only when *error is zero. Verifier failures set
+ * OFLAG_CORRUPT and clear *error; the failure is traced against the
+ * block recorded at @level of the da state path.
+ */
+bool
+xfs_scrub_da_process_error(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	int				*error)
+{
+	struct xfs_scrub_context	*sc = ds->sc;
+	bool				verifier_failed;
+
+	if (!*error)
+		return true;
+
+	if (*error == -EDEADLOCK) {
+		/* Used to restart an op with deadlock avoidance. */
+		trace_xfs_scrub_deadlock_retry(sc->ip, sc->sm, *error);
+		return false;
+	}
+
+	verifier_failed = (*error == -EFSBADCRC || *error == -EFSCORRUPTED);
+	if (verifier_failed) {
+		/* Note the badness but don't abort. */
+		sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+		*error = 0;
+	}
+
+	trace_xfs_scrub_file_op_error(sc, ds->dargs.whichfork,
+			xfs_dir2_da_to_db(ds->dargs.geo,
+				ds->state->path.blk[level].blkno),
+			*error, __return_address);
+	return false;
+}
+
+/*
+ * Check for da btree corruption. See the section about handling
+ * operational errors in common.c.
+ *
+ * Sets OFLAG_CORRUPT and traces the block recorded at @level of the
+ * da state path.
+ */
+void
+xfs_scrub_da_set_corrupt(
+	struct xfs_scrub_da_btree	*ds,
+	int				level)
+{
+	struct xfs_scrub_context	*sc = ds->sc;
+	struct xfs_da_state_blk		*blk;
+
+	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+	blk = &ds->state->path.blk[level];
+	trace_xfs_scrub_fblock_error(sc, ds->dargs.whichfork,
+			xfs_dir2_da_to_db(ds->dargs.geo, blk->blkno),
+			__return_address);
+}
+
+/*
+ * Find an entry at a certain level in a da btree.
+ *
+ * Returns a pointer to entry number @rec in the block at @level of
+ * the da state path, or NULL if the block's magic is not recognized.
+ */
+STATIC void *
+xfs_scrub_da_btree_entry(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	int				rec)
+{
+	struct xfs_da_state_blk		*blk = &ds->state->path.blk[level];
+	void				*baddr = blk->bp->b_addr;
+	char				*ents;
+	size_t				entsize;
+
+	/* Pick the entry array and entry size for this block type. */
+	switch (blk->magic) {
+	case XFS_ATTR_LEAF_MAGIC:
+	case XFS_ATTR3_LEAF_MAGIC:
+		ents = (char *)xfs_attr3_leaf_entryp(baddr);
+		entsize = sizeof(struct xfs_attr_leaf_entry);
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+	case XFS_DIR3_LEAFN_MAGIC:
+	case XFS_DIR2_LEAF1_MAGIC:
+	case XFS_DIR3_LEAF1_MAGIC:
+		ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
+		entsize = sizeof(struct xfs_dir2_leaf_entry);
+		break;
+	case XFS_DA_NODE_MAGIC:
+	case XFS_DA3_NODE_MAGIC:
+		ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr);
+		entsize = sizeof(struct xfs_da_node_entry);
+		break;
+	default:
+		return NULL;
+	}
+
+	return ents + (rec * entsize);
+}
+
+/*
+ * Scrub a da btree hash (key).
+ *
+ * Flags corruption if the hash is smaller than the last hash seen at
+ * this level, or (below the root) larger than the hash of the parent
+ * entry that points to this block. Always returns 0.
+ */
+int
+xfs_scrub_da_btree_hash(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	__be32				*hashp)
+{
+	struct xfs_da_state_blk		*parent;
+	struct xfs_da_node_entry	*entry;
+	xfs_dahash_t			parent_hash;
+	xfs_dahash_t			hash = be32_to_cpu(*hashp);
+
+	/* Is this hash in order? */
+	if (hash < ds->hashes[level])
+		xfs_scrub_da_set_corrupt(ds, level);
+	ds->hashes[level] = hash;
+
+	/* The root has no parent to compare against. */
+	if (level == 0)
+		return 0;
+
+	/* Is this hash no larger than the parent hash? */
+	parent = &ds->state->path.blk[level - 1];
+	entry = xfs_scrub_da_btree_entry(ds, level - 1, parent->index);
+	parent_hash = be32_to_cpu(entry->hashval);
+	if (parent_hash < hash)
+		xfs_scrub_da_set_corrupt(ds, level);
+
+	return 0;
+}
+
+/*
+ * Check a da btree pointer. Returns true if it's ok to use this
+ * pointer.
+ *
+ * A pointer is ok when it is at or above ds->lowest and, if
+ * ds->highest is nonzero, below ds->highest. Otherwise corruption is
+ * flagged at @level.
+ */
+STATIC bool
+xfs_scrub_da_btree_ptr_ok(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	xfs_dablk_t			blkno)
+{
+	bool				in_range;
+
+	in_range = blkno >= ds->lowest &&
+		   (ds->highest == 0 || blkno < ds->highest);
+	if (in_range)
+		return true;
+
+	xfs_scrub_da_set_corrupt(ds, level);
+	return false;
+}
+
+/*
+ * The da btree scrubber can handle leaf1 blocks as a degenerate
+ * form of leafn blocks. Since the regular da code doesn't handle
+ * leaf1, we must multiplex the verifiers.
+ *
+ * The buffer's ops are switched to the verifier set matching the
+ * block's magic, and then that set's read verifier is run.
+ */
+static void
+xfs_scrub_da_btree_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+
+	switch (be16_to_cpu(info->magic)) {
+	case XFS_DIR2_LEAF1_MAGIC:
+	case XFS_DIR3_LEAF1_MAGIC:
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		break;
+	default:
+		/*
+		 * xfs_da3_node_buf_ops already know how to handle
+		 * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+		 */
+		bp->b_ops = &xfs_da3_node_buf_ops;
+		break;
+	}
+
+	bp->b_ops->verify_read(bp);
+}
+/*
+ * Write-side counterpart of the multiplexing read verifier: switch the
+ * buffer's ops to the set matching the block's magic, then run that
+ * set's write verifier.
+ */
+static void
+xfs_scrub_da_btree_write_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+
+	switch (be16_to_cpu(info->magic)) {
+	case XFS_DIR2_LEAF1_MAGIC:
+	case XFS_DIR3_LEAF1_MAGIC:
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		break;
+	default:
+		/*
+		 * xfs_da3_node_buf_ops already know how to handle
+		 * DA*_NODE, ATTR*_LEAF, and DIR*_LEAFN blocks.
+		 */
+		bp->b_ops = &xfs_da3_node_buf_ops;
+		break;
+	}
+
+	bp->b_ops->verify_write(bp);
+}
+
+/*
+ * Buffer ops used when the scrubber reads da btree blocks; the two
+ * verifiers dispatch on the block's magic number.
+ */
+static const struct xfs_buf_ops xfs_scrub_da_btree_buf_ops = {
+ .name = "xfs_scrub_da_btree",
+ .verify_read = xfs_scrub_da_btree_read_verify,
+ .verify_write = xfs_scrub_da_btree_write_verify,
+};
+
+/*
+ * Check a block's sibling.
+ *
+ * @direction is handed straight to xfs_da3_path_shift; the caller in
+ * this file passes 0 for the back pointer and 1 for the forw pointer.
+ * @sibling is the sibling block number read from the block header.
+ *
+ * Returns 0 or an operational error from shifting the path; verifier
+ * errors are absorbed by xfs_scrub_da_process_error.
+ */
+STATIC int
+xfs_scrub_da_btree_block_check_sibling(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ int direction,
+ xfs_dablk_t sibling)
+{
+ int retval;
+ int error;
+
+ /* Work on a copy of the path so the real cursor is not disturbed. */
+ memcpy(&ds->state->altpath, &ds->state->path,
+ sizeof(ds->state->altpath));
+
+ /*
+ * If the pointer is null, we shouldn't be able to move the upper
+ * level pointer anywhere.
+ */
+ if (sibling == 0) {
+ error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
+ direction, false, &retval);
+ /* A successful shift means a sibling exists that the header denies. */
+ if (error == 0 && retval == 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ /* Any error from this probe is deliberately discarded. */
+ error = 0;
+ goto out;
+ }
+
+ /* Move the alternate cursor one block in the direction given. */
+ error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
+ direction, false, &retval);
+ if (!xfs_scrub_da_process_error(ds, level, &error))
+ return error;
+ /* Nonzero retval: the shift did not succeed although a sibling is claimed. */
+ if (retval) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ return error;
+ }
+
+ /* Compare upper level pointer to sibling pointer. */
+ if (ds->state->altpath.blk[level].blkno != sibling)
+ xfs_scrub_da_set_corrupt(ds, level);
+ xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+out:
+ return error;
+}
+
+/*
+ * Check a block's sibling pointers.
+ *
+ * Top level blocks must have neither pointer set. For every other
+ * level the back pointer is checked first and, if that did not fail,
+ * the forw pointer. The alternate path is zeroed before returning.
+ */
+STATIC int
+xfs_scrub_da_btree_block_check_siblings(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	struct xfs_da_blkinfo		*hdr)
+{
+	xfs_dablk_t			forw = be32_to_cpu(hdr->forw);
+	xfs_dablk_t			back = be32_to_cpu(hdr->back);
+	int				error;
+
+	/* Top level blocks should not have sibling pointers. */
+	if (level == 0) {
+		if (forw != 0 || back != 0)
+			xfs_scrub_da_set_corrupt(ds, level);
+		return 0;
+	}
+
+	/*
+	 * Check back (left) and forw (right) pointers. These functions
+	 * absorb error codes for us.
+	 */
+	error = xfs_scrub_da_btree_block_check_sibling(ds, level, 0, back);
+	if (!error)
+		error = xfs_scrub_da_btree_block_check_sibling(ds, level, 1,
+				forw);
+
+	memset(&ds->state->altpath, 0, sizeof(ds->state->altpath));
+	return error;
+}
+
+/*
+ * Load a dir/attribute block from a btree.
+ *
+ * Installs block @blkno at @level of the da state path: releases any
+ * buffer previously held at that level, validates the pointer, reads
+ * the buffer, checks the header (pad, owner, siblings) and finally
+ * fills in the path entry according to the block's magic.
+ *
+ * Returns 0 or an operational error. Corruption is reported through
+ * OFLAG_CORRUPT rather than the return value. When the block cannot
+ * be used, blk->bp is left NULL and blk->blkno is reset to 0.
+ */
+STATIC int
+xfs_scrub_da_btree_block(
+ struct xfs_scrub_da_btree *ds,
+ int level,
+ xfs_dablk_t blkno)
+{
+ struct xfs_da_state_blk *blk;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_da3_blkinfo *hdr3;
+ struct xfs_da_args *dargs = &ds->dargs;
+ struct xfs_inode *ip = ds->dargs.dp;
+ xfs_ino_t owner;
+ int *pmaxrecs;
+ struct xfs_da3_icnode_hdr nodehdr;
+ int error = 0;
+
+ blk = &ds->state->path.blk[level];
+ ds->state->path.active = level + 1;
+
+ /* Release old block. */
+ if (blk->bp) {
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+ }
+
+ /* Check the pointer. */
+ blk->blkno = blkno;
+ if (!xfs_scrub_da_btree_ptr_ok(ds, level, blkno))
+ goto out_nobuf;
+
+ /*
+ * Read the buffer.
+ * NOTE(review): the -2 argument presumably tells xfs_da_read_buf to
+ * tolerate a hole and hand back a NULL buffer -- confirm.
+ */
+ error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2,
+ &blk->bp, dargs->whichfork,
+ &xfs_scrub_da_btree_buf_ops);
+ if (!xfs_scrub_da_process_error(ds, level, &error))
+ goto out_nobuf;
+
+ /*
+ * We didn't find a dir btree root block, which means that
+ * there's no LEAF1/LEAFN tree (at least not where it's supposed
+ * to be), so jump out now.
+ */
+ if (ds->dargs.whichfork == XFS_DATA_FORK && level == 0 &&
+ blk->bp == NULL)
+ goto out_nobuf;
+
+ /* It's /not/ ok for attr trees not to have a da btree. */
+ if (blk->bp == NULL) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_nobuf;
+ }
+
+ hdr3 = blk->bp->b_addr;
+ blk->magic = be16_to_cpu(hdr3->hdr.magic);
+ pmaxrecs = &ds->maxrecs[level];
+
+ /* We only started zeroing the header on v5 filesystems. */
+ if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad)
+ xfs_scrub_da_set_corrupt(ds, level);
+
+ /* Check the owner. */
+ if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) {
+ owner = be64_to_cpu(hdr3->owner);
+ if (owner != ip->i_ino)
+ xfs_scrub_da_set_corrupt(ds, level);
+ }
+
+ /* Check the siblings. */
+ error = xfs_scrub_da_btree_block_check_siblings(ds, level, &hdr3->hdr);
+ if (error)
+ goto out;
+
+ /*
+ * Interpret the buffer. In every case blk->magic is normalized to
+ * the non-v3 magic, and blk->hashval and *pmaxrecs are filled in.
+ */
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_ATTR_LEAF_BUF);
+ blk->magic = XFS_ATTR_LEAF_MAGIC;
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, pmaxrecs);
+ /* A leaf is only expected once the recorded tree level is 0. */
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAFN_BUF);
+ blk->magic = XFS_DIR2_LEAFN_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DIR2_LEAF1_MAGIC:
+ case XFS_DIR3_LEAF1_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DIR_LEAF1_BUF);
+ blk->magic = XFS_DIR2_LEAF1_MAGIC;
+ blk->hashval = xfs_dir2_leaf_lasthash(ip, blk->bp, pmaxrecs);
+ if (ds->tree_level != 0)
+ xfs_scrub_da_set_corrupt(ds, level);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(dargs->trans, blk->bp,
+ XFS_BLFT_DA_NODE_BUF);
+ blk->magic = XFS_DA_NODE_MAGIC;
+ node = blk->bp->b_addr;
+ ip->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = ip->d_ops->node_tree_p(node);
+ *pmaxrecs = nodehdr.count;
+ /* NOTE(review): indexes btree[-1] if nodehdr.count is 0 -- confirm the read verifier rejects empty nodes. */
+ blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval);
+ if (level == 0) {
+ /* The root node's level is recorded in ds->tree_level. */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ ds->tree_level = nodehdr.level;
+ } else {
+ /* Lower nodes must match the recorded tree level. */
+ if (ds->tree_level != nodehdr.level) {
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
+ /* XXX: Check hdr3.pad32 once we know how to fix it. */
+ break;
+ default:
+ /* Unrecognized magic number. */
+ xfs_scrub_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+
+out:
+ return error;
+out_freebp:
+ /* Drop the buffer so the caller sees no block at this level. */
+ xfs_trans_brelse(dargs->trans, blk->bp);
+ blk->bp = NULL;
+out_nobuf:
+ blk->blkno = 0;
+ return error;
+}
+
+/*
+ * Visit all nodes and leaves of a da btree.
+ *
+ * Walk the dir/attr btree in the @whichfork fork of sc->ip, starting at
+ * block ds.lowest, and call @scrub_fn on each record of every leaf block
+ * that is reached.  @private is stored in ds.private so that it can be
+ * reached through the ds pointer handed to @scrub_fn.
+ *
+ * Returns zero right away if the fork is in neither extents nor btree
+ * format.  Otherwise returns whatever is in error when the walk stops:
+ * zero, or the code left behind by the block reader, the hash checker,
+ * @scrub_fn or xfs_scrub_should_terminate().
+ */
+int
+xfs_scrub_da_btree(
+ struct xfs_scrub_context *sc,
+ int whichfork,
+ xfs_scrub_da_btree_rec_fn scrub_fn,
+ void *private)
+{
+ struct xfs_scrub_da_btree ds = {};
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_da_state_blk *blks;
+ struct xfs_da_node_entry *key;
+ void *rec;
+ xfs_dablk_t blkno;
+ int level;
+ int error;
+
+ /* Skip short format data structures; no btree to scan. */
+ if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ /*
+ * Set up initial da state.  The attr fork uses the attr geometry
+ * with lowest == highest == 0; any other fork uses the directory
+ * geometry and is bounded by its leafblk and freeblk addresses.
+ * The walk always starts at ds.lowest.
+ */
+ ds.dargs.dp = sc->ip;
+ ds.dargs.whichfork = whichfork;
+ ds.dargs.trans = sc->tp;
+ ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
+ ds.state = xfs_da_state_alloc();
+ ds.state->args = &ds.dargs;
+ ds.state->mp = mp;
+ ds.sc = sc;
+ ds.private = private;
+ if (whichfork == XFS_ATTR_FORK) {
+ ds.dargs.geo = mp->m_attr_geo;
+ ds.lowest = 0;
+ ds.highest = 0;
+ } else {
+ ds.dargs.geo = mp->m_dir_geo;
+ ds.lowest = ds.dargs.geo->leafblk;
+ ds.highest = ds.dargs.geo->freeblk;
+ }
+ blkno = ds.lowest;
+ level = 0;
+
+ /*
+ * Find the root of the da tree, if present.  blks[] is the per-level
+ * path array inside the da state; blks[level] describes the block
+ * currently being visited at that depth.
+ */
+ blks = ds.state->path.blk;
+ error = xfs_scrub_da_btree_block(&ds, level, blkno);
+ if (error)
+ goto out_state;
+ /*
+ * We didn't find a block at ds.lowest, which means that there's
+ * no LEAF1/LEAFN tree (at least not where it's supposed to be),
+ * so jump out now.
+ */
+ if (blks[level].bp == NULL)
+ goto out_state;
+
+ blks[level].index = 0;
+ while (level >= 0 && level < XFS_DA_NODE_MAXDEPTH) {
+ /*
+ * Handle leaf block.  Any block whose (normalized) magic is
+ * not XFS_DA_NODE_MAGIC is treated as a leaf here.
+ */
+ if (blks[level].magic != XFS_DA_NODE_MAGIC) {
+ /*
+ * End of leaf, pop back towards the root.  The parent's
+ * index is advanced so the next pass moves on to the next
+ * child; popping level 0 makes level negative, which ends
+ * the loop.
+ */
+ if (blks[level].index >= ds.maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds.tree_level++;
+ level--;
+ continue;
+ }
+
+ /*
+ * Dispatch record scrubbing.  The walk stops on a callback
+ * error, on a termination request, or as soon as the
+ * corruption flag is set in the scrub request.
+ */
+ rec = xfs_scrub_da_btree_entry(&ds, level,
+ blks[level].index);
+ error = scrub_fn(&ds, level, rec);
+ if (error)
+ break;
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ blks[level].index++;
+ continue;
+ }
+
+
+ /* End of node, pop back towards the root. */
+ if (blks[level].index >= ds.maxrecs[level]) {
+ if (level > 0)
+ blks[level - 1].index++;
+ ds.tree_level++;
+ level--;
+ continue;
+ }
+
+ /* Hashes in order for scrub? */
+ key = xfs_scrub_da_btree_entry(&ds, level, blks[level].index);
+ error = xfs_scrub_da_btree_hash(&ds, level, &key->hashval);
+ if (error)
+ goto out;
+
+ /*
+ * Drill another level deeper.  tree_level is decremented to the
+ * node level expected of the child block; a read that succeeds
+ * but yields no buffer also ends the walk.
+ */
+ blkno = be32_to_cpu(key->before);
+ level++;
+ ds.tree_level--;
+ error = xfs_scrub_da_btree_block(&ds, level, blkno);
+ if (error)
+ goto out;
+ if (blks[level].bp == NULL)
+ goto out;
+
+ blks[level].index = 0;
+ }
+
+out:
+ /* Release all the buffers we're tracking. */
+ for (level = 0; level < XFS_DA_NODE_MAXDEPTH; level++) {
+ if (blks[level].bp == NULL)
+ continue;
+ xfs_trans_brelse(sc->tp, blks[level].bp);
+ blks[level].bp = NULL;
+ }
+
+out_state:
+ xfs_da_state_free(ds.state);
+ return error;
+}
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
new file mode 100644
index 000000000000..d31468d68cef
--- /dev/null
+++ b/fs/xfs/scrub/dabtree.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_DABTREE_H__
+#define __XFS_SCRUB_DABTREE_H__
+
+/*
+ * dir/attr btree
+ *
+ * Walk state shared between the da btree scrubber and the per-record
+ * callback it invokes.
+ */
+
+struct xfs_scrub_da_btree {
+ struct xfs_da_args dargs; /* inode, fork, transaction and geometry of the walk */
+ xfs_dahash_t hashes[XFS_DA_NODE_MAXDEPTH]; /* NOTE(review): presumably the hash tracked per level by xfs_scrub_da_btree_hash() - confirm */
+ int maxrecs[XFS_DA_NODE_MAXDEPTH]; /* record count of the block visited at each level */
+ struct xfs_da_state *state; /* da state; state->path.blk[] holds the block at each level */
+ struct xfs_scrub_context *sc; /* scrub context that owns this walk */
+ void *private; /* caller's opaque pointer passed to xfs_scrub_da_btree() */
+
+ /*
+ * Lowest and highest directory block address in which we expect
+ * to find dir/attr btree node blocks. For a directory this
+ * (presumably) means between LEAF_OFFSET and FREE_OFFSET; for
+ * attributes there is no limit (both fields are set to zero).
+ */
+ xfs_dablk_t lowest;
+ xfs_dablk_t highest;
+
+ int tree_level; /* node level expected at the current depth; leaf blocks expect 0 */
+};
+
+typedef int (*xfs_scrub_da_btree_rec_fn)(struct xfs_scrub_da_btree *ds,
+ int level, void *rec); /* per-leaf-record callback; a nonzero return stops the walk */
+
+/*
+ * Check for da btree operation errors.  Callers treat a false return
+ * as "stop processing this block" and then return *error.
+ */
+bool xfs_scrub_da_process_error(struct xfs_scrub_da_btree *ds, int level, int *error);
+
+/*
+ * Check for da btree corruption.  Called with the tree level at which
+ * the problem was noticed.
+ */
+void xfs_scrub_da_set_corrupt(struct xfs_scrub_da_btree *ds, int level);
+
+int xfs_scrub_da_btree_hash(struct xfs_scrub_da_btree *ds, int level,
+ __be32 *hashp); /* hash-order check for the entry at @level; nonzero aborts the walk */
+int xfs_scrub_da_btree(struct xfs_scrub_context *sc, int whichfork,
+ xfs_scrub_da_btree_rec_fn scrub_fn, void *private); /* walk the whole tree, calling @scrub_fn per leaf record */
+
+#endif /* __XFS_SCRUB_DABTREE_H__ */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
new file mode 100644
index 000000000000..69e1efdd4019
--- /dev/null
+++ b/fs/xfs/scrub/dir.c
@@ -0,0 +1,816 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/dabtree.h"
+
+/*
+ * Set us up to scrub directories.
+ *
+ * Thin wrapper: hands @sc and @ip to xfs_scrub_setup_inode_contents()
+ * with a third argument of zero and returns its result unchanged.
+ */
+int
+xfs_scrub_setup_directory(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/* Directories */
+
+/*
+ * Context for scrubbing directory entries.  The readdir actor is handed
+ * only the embedded dir_iter and uses container_of() to get back to this
+ * structure, and from there to the scrub context.
+ */
+
+struct xfs_scrub_dir_ctx {
+ /* VFS fill-directory iterator */
+ struct dir_context dir_iter;
+
+ struct xfs_scrub_context *sc; /* scrub context of the directory being checked */
+};
+
+/*
+ * Check that an inode's mode matches a given DT_ type.
+ *
+ * @offset is the directory block offset used when reporting problems,
+ * @inum the inode the dirent points at and @dtype the DT_* value that
+ * readdir reported for it.  Returns whatever error is left after
+ * xfs_scrub_fblock_process_error() has looked at the xfs_iget() result,
+ * or zero if the inode was never grabbed.
+ */
+STATIC int
+xfs_scrub_dir_check_ftype(
+	struct xfs_scrub_dir_ctx	*sdc,
+	xfs_fileoff_t			offset,
+	xfs_ino_t			inum,
+	int				dtype)
+{
+	struct xfs_scrub_context	*sc = sdc->sc;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_inode		*child;
+	int				child_dtype;
+	int				error;
+
+	/*
+	 * Without the ftype feature the only values we accept from the
+	 * dirent are DT_UNKNOWN and DT_DIR; there is nothing to compare
+	 * against the inode, so we are done either way.
+	 */
+	if (!xfs_sb_version_hasftype(&mp->m_sb)) {
+		if (dtype != DT_UNKNOWN && dtype != DT_DIR)
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+					offset);
+		return 0;
+	}
+
+	/*
+	 * Grab the inode pointed to by the dirent.  We release the
+	 * inode before we cancel the scrub transaction.  Since we
+	 * don't know a priori that releasing the inode won't trigger
+	 * eofblocks cleanup (which allocates what would be a nested
+	 * transaction), we can't use DONTCACHE here because DONTCACHE
+	 * inodes can trigger immediate inactive cleanup of the inode.
+	 */
+	error = xfs_iget(mp, sc->tp, inum, 0, 0, &child);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, offset,
+			&error))
+		return error;
+
+	/* Convert mode to the DT_* values that dir_emit uses. */
+	child_dtype = xfs_dir3_get_dtype(mp,
+			xfs_mode_to_ftype(VFS_I(child)->i_mode));
+	if (child_dtype != dtype)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+	iput(VFS_I(child));
+	return error;
+}
+
+/*
+ * Scrub a single directory entry.
+ *
+ * This is the readdir actor: the VFS directory iterator calls it once
+ * for every entry in the directory.  For each entry we check that the
+ * inode number is sane, apply the extra rules for "." and "..", confirm
+ * that looking the name up again yields the same inode, and finally
+ * compare the reported ftype against the inode itself.
+ *
+ * Returns zero when an entry was merely flagged as corrupt, otherwise
+ * the error left by the lookup or the ftype check.
+ */
+STATIC int
+xfs_scrub_dir_actor(
+	struct dir_context	*dir_iter,
+	const char		*name,
+	int			namelen,
+	loff_t			pos,
+	u64			ino,
+	unsigned		type)
+{
+	struct xfs_scrub_dir_ctx	*sdc;
+	struct xfs_scrub_context	*sc;
+	struct xfs_inode	*dp;
+	struct xfs_mount	*mp;
+	struct xfs_name		xname;
+	xfs_ino_t		lookup_ino;
+	xfs_dablk_t		offset;
+	bool			is_dot;
+	bool			is_dotdot;
+	int			error = 0;
+
+	sdc = container_of(dir_iter, struct xfs_scrub_dir_ctx, dir_iter);
+	sc = sdc->sc;
+	dp = sc->ip;
+	mp = dp->i_mount;
+
+	/* Directory block that holds this entry, for problem reports. */
+	offset = xfs_dir2_db_to_da(mp->m_dir_geo,
+			xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+
+	/* Does this inode number make sense? */
+	if (!xfs_verify_dir_ino(mp, ino)) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+		return error;
+	}
+
+	is_dot = !strncmp(".", name, namelen);
+	is_dotdot = !is_dot && !strncmp("..", name, namelen);
+
+	/* Both special entries must be typed as directories. */
+	if ((is_dot || is_dotdot) &&
+	    xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+	/* "." must point back at the directory itself. */
+	if (is_dot && ino != dp->i_ino)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+	/* ".." of the root directory must also point at the root. */
+	if (is_dotdot && dp->i_ino == mp->m_sb.sb_rootino &&
+	    ino != dp->i_ino)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+	/* Verify that we can look up this name by hash. */
+	xname.name = name;
+	xname.len = namelen;
+	xname.type = XFS_DIR3_FT_UNKNOWN;
+
+	error = xfs_dir_lookup(sc->tp, dp, &xname, &lookup_ino, NULL);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, offset,
+			&error))
+		return error;
+	if (lookup_ino != ino) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+		return error;
+	}
+
+	/* Verify the file type; its return value becomes ours. */
+	return xfs_scrub_dir_check_ftype(sdc, offset, lookup_ino, type);
+}
+
+/*
+ * Scrub a directory btree record.
+ *
+ * @rec is a leaf entry (hash value plus data pointer) found by the da
+ * btree walk at @level.  We check the hash ordering, follow the data
+ * pointer to the directory data block, and make sure the entry found
+ * there has a sane inode number, a tag that matches its own offset and
+ * a name that hashes to the value stored in the leaf.
+ *
+ * The offset comes straight from the (untrusted) leaf entry, so the
+ * entry it designates is bounds-checked against the data buffer before
+ * its name length is used to locate the tag or to hash the name.
+ *
+ * Returns zero when the record was checked (or merely flagged as
+ * corrupt), otherwise the error left by the hash check or block read.
+ */
+STATIC int
+xfs_scrub_dir_rec(
+	struct xfs_scrub_da_btree	*ds,
+	int				level,
+	void				*rec)
+{
+	struct xfs_mount		*mp = ds->state->mp;
+	struct xfs_dir2_leaf_entry	*ent = rec;
+	struct xfs_inode		*dp = ds->dargs.dp;
+	struct xfs_dir2_data_entry	*dent;
+	struct xfs_buf			*bp;
+	xfs_ino_t			ino;
+	xfs_dablk_t			rec_bno;
+	xfs_dir2_db_t			db;
+	xfs_dir2_data_aoff_t		off;
+	xfs_dir2_dataptr_t		ptr;
+	xfs_dahash_t			calc_hash;
+	xfs_dahash_t			hash;
+	unsigned int			buflen;
+	unsigned int			space;
+	unsigned int			tag;
+	int				error;
+
+	/* Check the hash of the entry. */
+	error = xfs_scrub_da_btree_hash(ds, level, &ent->hashval);
+	if (error)
+		goto out;
+
+	/* Valid hash pointer? */
+	ptr = be32_to_cpu(ent->address);
+	if (ptr == 0)
+		return 0;
+
+	/* Find the directory entry's location. */
+	db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr);
+	off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr);
+	rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db);
+
+	/* Data blocks live below the leaf block address. */
+	if (rec_bno >= mp->m_dir_geo->leafblk) {
+		xfs_scrub_da_set_corrupt(ds, level);
+		goto out;
+	}
+	error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp);
+	if (!xfs_scrub_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
+			&error))
+		goto out;
+	if (!bp) {
+		xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+		goto out;
+	}
+
+	/*
+	 * Nothing has verified that the offset designates the start of a
+	 * real entry.  Make sure the smallest possible entry fits between
+	 * the offset and the end of the buffer before reading the name
+	 * length out of it.
+	 */
+	buflen = BBTOB(bp->b_length);
+	if (off >= buflen) {
+		xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+		goto out_relse;
+	}
+	space = buflen - off;
+	if (space < (unsigned int)dp->d_ops->data_entsize(1)) {
+		xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+		goto out_relse;
+	}
+
+	/*
+	 * Retrieve the entry.  A zero-length name is never valid, and an
+	 * entry whose claimed size runs off the end of the buffer would
+	 * make the tag lookup and the name hash read beyond the block.
+	 */
+	dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
+	if (dent->namelen == 0 ||
+	    (unsigned int)dp->d_ops->data_entsize(dent->namelen) > space) {
+		xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+		goto out_relse;
+	}
+
+	/* Sanity check the entry and compare hashes. */
+	ino = be64_to_cpu(dent->inumber);
+	hash = be32_to_cpu(ent->hashval);
+	tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent));
+	if (!xfs_verify_dir_ino(mp, ino) || tag != off)
+		xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+	calc_hash = xfs_da_hashname(dent->name, dent->namelen);
+	if (calc_hash != hash)
+		xfs_scrub_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+
+out_relse:
+	xfs_trans_brelse(ds->dargs.trans, bp);
+out:
+	return error;
+}
+
+/*
+ * Is this unused entry either in the bestfree or smaller than all of
+ * them?  We've already checked that the bestfrees are sorted longest to
+ * shortest, and that there aren't any bogus entries.
+ *
+ * @bf is the block's bestfree array and @dup the unused entry under
+ * test; @lblk is only used to report a problem.
+ */
+STATIC void
+xfs_scrub_directory_check_free_entry(
+	struct xfs_scrub_context	*sc,
+	xfs_dablk_t			lblk,
+	struct xfs_dir2_data_free	*bf,
+	struct xfs_dir2_data_unused	*dup)
+{
+	unsigned int			want = be16_to_cpu(dup->length);
+	int				slot = XFS_DIR2_DATA_FD_COUNT - 1;
+
+	/* Unused entry is shorter than any of the bestfrees */
+	if (want < be16_to_cpu(bf[slot].length))
+		return;
+
+	/* Otherwise some bestfree slot has to record this length. */
+	while (slot >= 0) {
+		if (be16_to_cpu(bf[slot].length) == want)
+			return;
+		slot--;
+	}
+
+	/* Unused entry should be in the bestfrees but wasn't found. */
+	xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/*
+ * Check free space info in a directory data block.
+ *
+ * Reads the block at @lblk (through the block-format reader when
+ * @is_block is set, otherwise the data-format reader) and makes two
+ * passes over it.  The first pass checks that every bestfree slot in
+ * use points at a matching unused entry and that the slots are ordered
+ * largest to smallest.  The second pass walks every entry in the block
+ * and checks that each unused entry is either recorded in the bestfree
+ * array or smaller than everything in it, that the entries exactly fill
+ * the block, and that there are at least as many unused entries as
+ * bestfree slots in use.
+ *
+ * Problems are reported against @lblk.  Returns the error left by the
+ * block read after xfs_scrub_fblock_process_error() has seen it.
+ */
+STATIC int
+xfs_scrub_directory_data_bestfree(
+ struct xfs_scrub_context *sc,
+ xfs_dablk_t lblk,
+ bool is_block)
+{
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *dfp;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_free *bf;
+ struct xfs_mount *mp = sc->mp;
+ const struct xfs_dir_ops *d_ops;
+ char *ptr;
+ char *endptr;
+ u16 tag;
+ unsigned int nr_bestfrees = 0;
+ unsigned int nr_frees = 0;
+ unsigned int smallest_bestfree;
+ int newlen;
+ int offset;
+ int error;
+
+ d_ops = sc->ip->d_ops;
+
+ if (is_block) {
+ /* dir block format: the single block must sit at the data offset */
+ if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
+ } else {
+ /* dir data format */
+ error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp);
+ }
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+
+ /*
+ * Do the bestfrees correspond to actual free space?  Slots with a
+ * zero offset are unused and skipped; smallest_bestfree starts at
+ * UINT_MAX so the first slot in use always passes the order check.
+ */
+ bf = d_ops->data_bestfree_p(bp->b_addr);
+ smallest_bestfree = UINT_MAX;
+ for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
+ offset = be16_to_cpu(dfp->offset);
+ if (offset == 0)
+ continue;
+ if (offset >= mp->m_dir_geo->blksize) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset);
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+
+ /*
+ * bestfree doesn't match the entry it points at?  The entry
+ * must carry the free tag, the same length as the slot, and a
+ * trailing tag equal to its own offset within the block.
+ */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
+ be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
+ tag != ((char *)dup - (char *)bp->b_addr)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ /* bestfree records should be ordered largest to smallest */
+ if (smallest_bestfree < be16_to_cpu(dfp->length)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+
+ smallest_bestfree = be16_to_cpu(dfp->length);
+ nr_bestfrees++;
+ }
+
+ /*
+ * Make sure the bestfrees are actually the best free spaces.  The
+ * entry area ends at the leaf array ahead of the block tail in block
+ * format, and at the end of the buffer otherwise.
+ */
+ ptr = (char *)d_ops->data_entry_p(bp->b_addr);
+ if (is_block) {
+ struct xfs_dir2_block_tail *btp;
+
+ btp = xfs_dir2_block_tail_p(mp->m_dir_geo, bp->b_addr);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+ } else
+ endptr = (char *)bp->b_addr + BBTOB(bp->b_length);
+
+ /* Iterate the entries, stopping when we hit or go past the end. */
+ while (ptr < endptr) {
+ dup = (struct xfs_dir2_data_unused *)ptr;
+ /* Skip real entries; their size is derived from the name length */
+ if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
+ struct xfs_dir2_data_entry *dep;
+
+ dep = (struct xfs_dir2_data_entry *)ptr;
+ newlen = d_ops->data_entsize(dep->namelen);
+ if (newlen <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ goto out_buf;
+ }
+ ptr += newlen;
+ continue;
+ }
+
+ /* Spot check this free entry: its tag must equal its own offset */
+ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
+ if (tag != ((char *)dup - (char *)bp->b_addr))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /*
+ * Either this entry is a bestfree or it's smaller than
+ * any of the bestfrees.
+ */
+ xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup);
+
+ /*
+ * Move on.  A zero length would never advance the walk, so it
+ * is reported and ends the check.  Only free entries that end
+ * at or before endptr are counted.
+ */
+ newlen = be16_to_cpu(dup->length);
+ if (newlen <= 0) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
+ ptr += newlen;
+ if (ptr <= endptr)
+ nr_frees++;
+ }
+
+ /* We're required to fill all the space. */
+ if (ptr != endptr)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+ /* Did we see at least as many free slots as there are bestfrees? */
+ if (nr_frees < nr_bestfrees)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out_buf:
+ xfs_trans_brelse(sc->tp, bp);
+out:
+ return error;
+}
+
+/*
+ * Does the free space length in the free space index block ($len) match
+ * the longest length in the directory data block's bestfree array?
+ * Assume that we've already checked that the data block's bestfree
+ * array is in order, so the first slot holds the longest free space.
+ *
+ * @dbp is the data block's buffer; problems are reported against @lblk.
+ */
+STATIC void
+xfs_scrub_directory_check_freesp(
+	struct xfs_scrub_context	*sc,
+	xfs_dablk_t			lblk,
+	struct xfs_buf			*dbp,
+	unsigned int			len)
+{
+	struct xfs_dir2_data_free	*longest;
+	unsigned int			longest_len;
+	unsigned int			longest_off;
+
+	longest = sc->ip->d_ops->data_bestfree_p(dbp->b_addr);
+	longest_len = be16_to_cpu(longest->length);
+	longest_off = be16_to_cpu(longest->offset);
+
+	/* The index and the data block must agree on the length. */
+	if (longest_len != len)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+	/* A nonzero length needs a bestfree slot that is in use. */
+	if (len > 0 && longest_off == 0)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+}
+
+/*
+ * Check free space info in a directory leaf1 block.
+ *
+ * Reads the leaf block at @lblk and checks its header padding, the
+ * number of bestfree slots against the directory size, the entry count,
+ * the hash ordering and the stale count.  Each bestfree slot in use is
+ * then compared with the bestfree array of the data block it describes.
+ * Only args->geo is used from @args.
+ *
+ * The leaf buffer is released on every path once it has been read, and
+ * the bestfree loop stops at the first data block that cannot be read
+ * so that the failure is not overwritten by a later iteration.
+ *
+ * Returns the error left by the last block read after
+ * xfs_scrub_fblock_process_error() has seen it.
+ */
+STATIC int
+xfs_scrub_directory_leaf1_bestfree(
+	struct xfs_scrub_context	*sc,
+	struct xfs_da_args		*args,
+	xfs_dablk_t			lblk)
+{
+	struct xfs_dir3_icleaf_hdr	leafhdr;
+	struct xfs_dir2_leaf_entry	*ents;
+	struct xfs_dir2_leaf_tail	*ltp;
+	struct xfs_dir2_leaf		*leaf;
+	struct xfs_buf			*dbp;
+	struct xfs_buf			*bp;
+	const struct xfs_dir_ops	*d_ops = sc->ip->d_ops;
+	struct xfs_da_geometry		*geo = sc->mp->m_dir_geo;
+	__be16				*bestp;
+	__u16				best;
+	__u32				hash;
+	__u32				lasthash = 0;
+	__u32				bestcount;
+	unsigned int			stale = 0;
+	int				i;
+	int				error;
+
+	/* Read the free space block. */
+	error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+		goto out;
+
+	leaf = bp->b_addr;
+	d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = d_ops->leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(geo, leaf);
+	bestcount = be32_to_cpu(ltp->bestcount);
+	bestp = xfs_dir2_leaf_bests_p(ltp);
+
+	/* The header padding is only checked on CRC-enabled filesystems. */
+	if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr	*hdr3 = bp->b_addr;
+
+		if (hdr3->pad != cpu_to_be32(0))
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+	}
+
+	/*
+	 * There should be as many bestfree slots as there are dir data
+	 * blocks that can fit under i_size.
+	 */
+	if (bestcount != xfs_dir2_byte_to_db(geo, sc->ip->i_d.di_size)) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+		goto out_buf;
+	}
+
+	/* Is the leaf count even remotely sane? */
+	if (leafhdr.count > d_ops->leaf_max_ents(geo)) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+		goto out_buf;
+	}
+
+	/* Leaves and bests don't overlap in leaf format. */
+	if ((char *)&ents[leafhdr.count] > (char *)bestp) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+		goto out_buf;
+	}
+
+	/* Check hash value order, count stale entries. */
+	for (i = 0; i < leafhdr.count; i++) {
+		hash = be32_to_cpu(ents[i].hashval);
+		if (i > 0 && lasthash > hash)
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+		lasthash = hash;
+		if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			stale++;
+	}
+	if (leafhdr.stale != stale)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+
+	/*
+	 * Check all the bestfree entries.  Slot i describes the data block
+	 * at dir block i; NULLDATAOFF marks a slot with no data block.
+	 */
+	for (i = 0; i < bestcount; i++, bestp++) {
+		best = be16_to_cpu(*bestp);
+		if (best == NULLDATAOFF)
+			continue;
+		error = xfs_dir3_data_read(sc->tp, sc->ip,
+				i * args->geo->fsbcount, -1, &dbp);
+		/* Stop here so the failure is what we report. */
+		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+				&error))
+			break;
+		xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
+		xfs_trans_brelse(sc->tp, dbp);
+	}
+out_buf:
+	xfs_trans_brelse(sc->tp, bp);
+out:
+	return error;
+}
+
+/*
+ * Check free space info in a directory freespace block.
+ *
+ * Reads the free index block at @lblk, checks its header padding, and
+ * compares every bestfree slot in use with the bestfree array of the
+ * data block it describes.  Slots holding NULLDATAOFF are counted as
+ * stale, and the used plus stale counts must add up to the number of
+ * valid slots.  Only args->geo is used from @args.
+ *
+ * The free index buffer is released on every path once it has been
+ * read.  If a data block cannot be read the check stops there: the
+ * failure is returned as-is and the slot accounting, which would be
+ * incomplete, is skipped.
+ *
+ * Returns the error left by the last block read after
+ * xfs_scrub_fblock_process_error() has seen it.
+ */
+STATIC int
+xfs_scrub_directory_free_bestfree(
+	struct xfs_scrub_context	*sc,
+	struct xfs_da_args		*args,
+	xfs_dablk_t			lblk)
+{
+	struct xfs_dir3_icfree_hdr	freehdr;
+	struct xfs_buf			*dbp;
+	struct xfs_buf			*bp;
+	__be16				*bestp;
+	__u16				best;
+	unsigned int			stale = 0;
+	int				i;
+	int				error;
+
+	/* Read the free space block */
+	error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+		goto out;
+
+	/* The header padding is only checked on CRC-enabled filesystems. */
+	if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
+		struct xfs_dir3_free_hdr	*hdr3 = bp->b_addr;
+
+		if (hdr3->pad != cpu_to_be32(0))
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+	}
+
+	/* Check all the entries. */
+	sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr);
+	bestp = sc->ip->d_ops->free_bests_p(bp->b_addr);
+	for (i = 0; i < freehdr.nvalid; i++, bestp++) {
+		best = be16_to_cpu(*bestp);
+		if (best == NULLDATAOFF) {
+			stale++;
+			continue;
+		}
+		/* Slot i describes dir data block firstdb + i. */
+		error = xfs_dir3_data_read(sc->tp, sc->ip,
+				(freehdr.firstdb + i) * args->geo->fsbcount,
+				-1, &dbp);
+		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
+				&error))
+			goto out_buf;
+		xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
+		xfs_trans_brelse(sc->tp, dbp);
+	}
+
+	if (freehdr.nused + stale != freehdr.nvalid)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+out_buf:
+	xfs_trans_brelse(sc->tp, bp);
+out:
+	return error;
+}
+
+/*
+ * Check free space information in directories.
+ *
+ * Walks the data fork extent map of sc->ip in three stages: the data
+ * blocks below the leaf offset, an optional single leaf1 block at the
+ * leaf offset, and the free index blocks from the free offset upwards.
+ * Each block found is handed to the matching bestfree checker.  The
+ * later stages are skipped once corruption has been flagged.
+ *
+ * Returns zero for directories that are in neither extents nor btree
+ * format, otherwise the first error reported by a helper.
+ */
+STATIC int
+xfs_scrub_directory_blocks(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_da_args args;
+ struct xfs_ifork *ifp;
+ struct xfs_mount *mp = sc->mp;
+ xfs_fileoff_t leaf_lblk;
+ xfs_fileoff_t free_lblk;
+ xfs_fileoff_t lblk;
+ struct xfs_iext_cursor icur;
+ xfs_dablk_t dabno;
+ bool found;
+ int is_block = 0;
+ int error;
+
+ /* Ignore local format directories. */
+ if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+ sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ return 0;
+
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
+ leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
+ free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
+
+ /*
+ * Is this a block dir?  Only the dp, geo and trans members of args
+ * are filled in; the rest of the structure is left uninitialized.
+ */
+ args.dp = sc->ip;
+ args.geo = mp->m_dir_geo;
+ args.trans = sc->tp;
+ error = xfs_dir2_isblock(&args, &is_block);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
+ goto out;
+
+ /* Iterate all the data extents in the directory... */
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found) {
+ /* Block directories only have a single block at offset 0. */
+ if (is_block &&
+ (got.br_startoff > 0 ||
+ got.br_blockcount != args.geo->fsbcount)) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ got.br_startoff);
+ break;
+ }
+
+ /* No more data blocks... */
+ if (got.br_startoff >= leaf_lblk)
+ break;
+
+ /*
+ * Check each data block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xfs_scrub_directory_data_bestfree(sc, lblk,
+ is_block);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /*
+ * Look for a leaf1 block, which has free info.  It is recognised as
+ * a mapping that starts exactly at the leaf offset, is exactly one
+ * directory block long and is the last extent in the fork.  A block
+ * format directory must not have one.
+ * NOTE(review): the corruption report below uses lblk, which still
+ * holds the value left by the data block loop, not leaf_lblk.
+ */
+ if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) &&
+ got.br_startoff == leaf_lblk &&
+ got.br_blockcount == args.geo->fsbcount &&
+ !xfs_iext_next_extent(ifp, &icur, &got)) {
+ if (is_block) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ error = xfs_scrub_directory_leaf1_bestfree(sc, &args,
+ leaf_lblk);
+ if (error)
+ goto out;
+ }
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Scan for free blocks */
+ lblk = free_lblk;
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ while (found) {
+ /*
+ * Dirs can't have blocks mapped above 2^32.
+ * Single-block dirs shouldn't even be here.
+ */
+ lblk = got.br_startoff;
+ if (lblk & ~0xFFFFFFFFULL) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+ if (is_block) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out;
+ }
+
+ /*
+ * Check each dir free block's bestfree data.
+ *
+ * Iterate all the fsbcount-aligned block offsets in
+ * this directory. The directory block reading code is
+ * smart enough to do its own bmap lookups to handle
+ * discontiguous directory blocks. When we're done
+ * with the extent record, re-query the bmap at the
+ * next fsbcount-aligned offset to avoid redundant
+ * block checks.
+ */
+ for (lblk = roundup((xfs_dablk_t)got.br_startoff,
+ args.geo->fsbcount);
+ lblk < got.br_startoff + got.br_blockcount;
+ lblk += args.geo->fsbcount) {
+ error = xfs_scrub_directory_free_bestfree(sc, &args,
+ lblk);
+ if (error)
+ goto out;
+ }
+ dabno = got.br_startoff + got.br_blockcount;
+ lblk = roundup(dabno, args.geo->fsbcount);
+ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
+ }
+out:
+ return error;
+}
+
+/*
+ * Scrub a whole directory.
+ *
+ * Runs three checks in turn, stopping as soon as one of them returns an
+ * error or flags corruption: the directory btree structure, the free
+ * space bookkeeping, and finally a readdir pass that looks every entry
+ * up again by name.  Returns -ENOENT if the inode is not a directory.
+ */
+int
+xfs_scrub_directory(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_scrub_dir_ctx	sdc = {
+		.dir_iter.actor = xfs_scrub_dir_actor,
+		.dir_iter.pos = 0,
+		.sc = sc,
+	};
+	size_t				bufsize;
+	loff_t				lastpos;
+	int				error;
+
+	if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+		return -ENOENT;
+
+	/* Plausible size? */
+	if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) {
+		xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+		return 0;
+	}
+
+	/* Check directory tree structure */
+	error = xfs_scrub_da_btree(sc, XFS_DATA_FORK, xfs_scrub_dir_rec, NULL);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	/* Check the freespace. */
+	error = xfs_scrub_directory_blocks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	/*
+	 * Check that every dirent we see can also be looked up by hash.
+	 * Userspace usually asks for a 32k buffer, so we will too.
+	 */
+	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+				sc->ip->i_d.di_size);
+
+	/*
+	 * Look up every name in this directory by hash.
+	 *
+	 * Use the xfs_readdir function to call xfs_scrub_dir_actor on
+	 * every directory entry in this directory.  In _actor, we check
+	 * the name, inode number, and ftype (if applicable) of the
+	 * entry.  xfs_readdir uses the VFS filldir functions to provide
+	 * iteration context.
+	 *
+	 * The VFS grabs a read or write lock via i_rwsem before it reads
+	 * or writes to a directory.  If we've gotten this far we've
+	 * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+	 * getting a write lock on i_rwsem.  Therefore, it is safe for us
+	 * to drop the ILOCK here in order to reuse the _readdir and
+	 * _dir_lookup routines, which do their own ILOCK locking.
+	 */
+	sc->ilock_flags &= ~XFS_ILOCK_EXCL;
+	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+
+	/* Keep reading until a pass no longer moves the position. */
+	do {
+		lastpos = sdc.dir_iter.pos;
+		error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize);
+		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+				&error))
+			return error;
+	} while (lastpos != sdc.dir_iter.pos);
+
+	return error;
+}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
new file mode 100644
index 000000000000..496d6f2fbb9e
--- /dev/null
+++ b/fs/xfs/scrub/ialloc.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Prepare to scrub the inode btrees of an AG.
+ *
+ * The retry flag (sc->try_harder) is handed to the common AG btree setup
+ * helper as its third argument, so that a second attempt -- made after
+ * the inobt and an inode were found to disagree -- first forces logged
+ * inode cores out to disk.
+ */
+int
+xfs_scrub_setup_ag_iallocbt(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	bool				force_log = sc->try_harder;
+
+	return xfs_scrub_setup_ag_btree(sc, ip, force_log);
+}
+
+/* Inode btree scrubber. */
+
+/*
+ * Sanity-check the block extent backing one piece of an inode chunk.
+ *
+ * @agino is the first inode of the piece and @len its length in fs blocks.
+ * The record is flagged corrupt if the extent is empty or wraps around, or
+ * if either end of it fails xfs_verify_agbno() for this AG.  Always
+ * returns true, so callers carry on with their remaining checks.
+ */
+STATIC bool
+xfs_scrub_iallocbt_chunk(
+	struct xfs_scrub_btree		*bs,
+	struct xfs_inobt_rec_incore	*irec,
+	xfs_agino_t			agino,
+	xfs_extlen_t			len)
+{
+	struct xfs_mount		*mp = bs->cur->bc_mp;
+	xfs_agnumber_t			agno = bs->cur->bc_private.a.agno;
+	xfs_agblock_t			bno = XFS_AGINO_TO_AGBNO(mp, agino);
+	bool				bad;
+
+	/* Stop at the first failing test, exactly like a chained ||. */
+	bad = (bno + len <= bno);
+	if (!bad)
+		bad = !xfs_verify_agbno(mp, agno, bno);
+	if (!bad)
+		bad = !xfs_verify_agbno(mp, agno, bno + len - 1);
+
+	if (bad)
+		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+	return true;
+}
+
+/* Return how many bits are set in @freemask, i.e. the number of free inodes. */
+static unsigned int
+xfs_scrub_iallocbt_freecount(
+	xfs_inofree_t			freemask)
+{
+	unsigned int			nr_free;
+
+	/* hweight64() is only the right tool if the mask is 64 bits wide. */
+	BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
+
+	nr_free = hweight64(freemask);
+	return nr_free;
+}
+
+/*
+ * Check one inode of a cluster against the ir_free bit that covers it.
+ *
+ * @fsino is the inode number of the first inode in the cluster, @chunkino
+ * the offset of the cluster within the inobt record and @clusterino the
+ * offset of the inode within the cluster buffer @bp.
+ *
+ * Returns 0 in the normal case (mismatches are reported by flagging the
+ * btree record corrupt), the error set by xfs_scrub_should_terminate() if
+ * the scrub is being stopped, or -EDEADLOCK if an uncached inode disagrees
+ * with the free mask and we are not yet in try_harder mode.
+ */
+STATIC int
+xfs_scrub_iallocbt_check_cluster_freemask(
+	struct xfs_scrub_btree		*bs,
+	xfs_ino_t			fsino,
+	xfs_agino_t			chunkino,
+	xfs_agino_t			clusterino,
+	struct xfs_inobt_rec_incore	*irec,
+	struct xfs_buf			*bp)
+{
+	struct xfs_mount		*mp = bs->cur->bc_mp;
+	struct xfs_dinode		*dip;
+	bool				inode_is_free;
+	bool				freemask_ok;
+	bool				inuse;
+	int				error = 0;
+
+	if (xfs_scrub_should_terminate(bs->sc, &error))
+		return error;
+
+	/* The buffer must hold an inode here, and it must be this inode. */
+	dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
+		goto corrupt;
+	if (dip->di_version >= 3 &&
+	    be64_to_cpu(dip->di_ino) != fsino + clusterino)
+		goto corrupt;
+
+	/* What does the inobt record claim about this inode? */
+	inode_is_free = (irec->ir_free &
+			 XFS_INOBT_MASK(chunkino + clusterino)) != 0;
+
+	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
+			fsino + clusterino, &inuse);
+	if (error == -ENODATA) {
+		/* Not cached, so judge by the mode in the disk buffer. */
+		freemask_ok = inode_is_free ^ !!(dip->di_mode);
+		if (!freemask_ok && !bs->sc->try_harder)
+			return -EDEADLOCK;
+	} else if (error < 0) {
+		/*
+		 * Inode is only half assembled, or there was an IO error,
+		 * or the verifier failed, so don't bother trying to check.
+		 * The inode scrubber can deal with this.
+		 */
+		return 0;
+	} else {
+		/* Inode is fully in core; use the cache's answer. */
+		freemask_ok = inode_is_free ^ inuse;
+	}
+
+	if (freemask_ok)
+		return 0;
+corrupt:
+	xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+	return 0;
+}
+
+/*
+ * Make sure the free mask is consistent with what the inodes think.
+ *
+ * The record is walked one inode cluster at a time.  A cluster that the
+ * holemask says is only partly a hole marks the record corrupt; a cluster
+ * that is entirely a hole is skipped.  For every other cluster the buffer
+ * is read and each inode in it is checked against its ir_free bit.
+ *
+ * Returns 0, or the first nonzero value handed back by the per-inode
+ * check (in which case the cluster buffer has already been released).
+ */
+STATIC int
+xfs_scrub_iallocbt_check_freemask(
+ struct xfs_scrub_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_imap imap;
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *bp;
+ xfs_ino_t fsino;
+ xfs_agino_t nr_inodes;
+ xfs_agino_t agino;
+ xfs_agino_t chunkino;
+ xfs_agino_t clusterino;
+ xfs_agblock_t agbno;
+ int blks_per_cluster;
+ uint16_t holemask;
+ uint16_t ir_holemask;
+ int error = 0;
+
+ /*
+ * Make sure the freemask matches the inode records.  Start by
+ * working out the cluster geometry: blocks per cluster, and from
+ * that the number of inodes per cluster.
+ */
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); /* oinfo is not read again in this function */
+
+ for (agino = irec->ir_startino;
+ agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
+ agino += blks_per_cluster * mp->m_sb.sb_inopblock) {
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ chunkino = agino - irec->ir_startino;
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+
+ /*
+ * Compute the holemask mask for this cluster, i.e. the set of
+ * holemask bits that cover the inodes of this cluster.
+ */
+ for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
+ clusterino += XFS_INODES_PER_HOLEMASK_BIT)
+ holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /* The whole cluster must be a hole or not a hole. */
+ ir_holemask = (irec->ir_holemask & holemask);
+ if (ir_holemask != holemask && ir_holemask != 0) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ continue;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask)
+ continue;
+
+ /*
+ * Grab the inode cluster buffer.  The map covers the whole
+ * cluster starting at its first block, with no offset.
+ */
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
+ agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap.im_boffset = 0;
+
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
+ &dip, &bp, 0, 0);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error))
+ continue; /* no buffer for this cluster; try the next one */
+
+ /* Which inodes are free? */
+ for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
+ error = xfs_scrub_iallocbt_check_cluster_freemask(bs,
+ fsino, chunkino, clusterino, irec, bp);
+ if (error) {
+ xfs_trans_brelse(bs->cur->bc_tp, bp);
+ return error;
+ }
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, bp);
+ }
+
+ return error;
+}
+
+/*
+ * Scrub an inobt/finobt record.
+ *
+ * Checks the inode and free counts, the placement and alignment of the
+ * chunk within the AG, the block extents behind the chunk (one extent for
+ * a non-sparse record, one per holemask bit for a sparse record) and
+ * finally the free mask against the inodes themselves.  Problems are
+ * reported by flagging the btree record corrupt; the return value is 0 or
+ * whatever the free mask check returned.
+ */
+STATIC int
+xfs_scrub_iallocbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_inobt_rec_incore irec;
+ uint64_t holes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agino_t agino;
+ xfs_agblock_t agbno;
+ xfs_extlen_t len;
+ int holecount;
+ int i;
+ int error = 0;
+ unsigned int real_freecount;
+ uint16_t holemask;
+
+ xfs_inobt_btrec_to_irec(mp, rec, &irec);
+
+ if (irec.ir_count > XFS_INODES_PER_CHUNK ||
+ irec.ir_freecount > XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ real_freecount = irec.ir_freecount +
+ (XFS_INODES_PER_CHUNK - irec.ir_count); /* inodes not counted in ir_count are added to the free count */
+ if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ agino = irec.ir_startino;
+ /*
+ * Both the first and the last inode of the chunk have to be valid
+ * inode numbers within the AG; if not, stop checking this record.
+ */
+ if (!xfs_verify_agino(mp, agno, agino) ||
+ !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ goto out;
+ }
+
+ /*
+ * Make sure this record is aligned to the inode cluster size and to
+ * the inode allocation alignment.
+ */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
+ if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
+ (agbno & (xfs_icluster_size_fsb(mp) - 1)))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /*
+ * Handle non-sparse inodes: the chunk is one extent long enough to
+ * hold XFS_INODES_PER_CHUNK inodes, all of which must be counted.
+ */
+ if (!xfs_inobt_issparse(irec.ir_holemask)) {
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
+ if (irec.ir_count != XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
+ goto out;
+ goto check_freemask;
+ }
+
+ /*
+ * Check each chunk of a sparse inode cluster.  Every inode that falls
+ * in a hole must also be marked free in ir_free.
+ */
+ holemask = irec.ir_holemask;
+ holecount = 0;
+ len = XFS_B_TO_FSB(mp,
+ XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
+ holes = ~xfs_inobt_irec_to_allocmask(&irec);
+ if ((holes & irec.ir_free) != holes ||
+ irec.ir_freecount > irec.ir_count)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
+ if (holemask & 1)
+ holecount += XFS_INODES_PER_HOLEMASK_BIT;
+ else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
+ break;
+ holemask >>= 1;
+ agino += XFS_INODES_PER_HOLEMASK_BIT;
+ }
+
+ if (holecount > XFS_INODES_PER_CHUNK ||
+ holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+check_freemask:
+ error = xfs_scrub_iallocbt_check_freemask(bs, &irec);
+ if (error)
+ goto out;
+
+out:
+ return error;
+}
+
+/*
+ * Scrub one of the inode btrees of an AG.
+ *
+ * @which selects the cursor to walk: XFS_BTNUM_INO picks the inobt
+ * cursor, any other value the finobt cursor.  Each record is checked by
+ * xfs_scrub_iallocbt_rec().
+ */
+STATIC int
+xfs_scrub_iallocbt(
+	struct xfs_scrub_context	*sc,
+	xfs_btnum_t			which)
+{
+	struct xfs_owner_info		oinfo;
+	struct xfs_btree_cur		*cur;
+
+	if (which == XFS_BTNUM_INO)
+		cur = sc->sa.ino_cur;
+	else
+		cur = sc->sa.fino_cur;
+
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
+	return xfs_scrub_btree(sc, cur, xfs_scrub_iallocbt_rec, &oinfo, NULL);
+}
+
+/* Scrub the inode btree (XFS_BTNUM_INO) of the AG set up in @sc. */
+int
+xfs_scrub_inobt(
+	struct xfs_scrub_context	*sc)
+{
+	const xfs_btnum_t		which = XFS_BTNUM_INO;
+
+	return xfs_scrub_iallocbt(sc, which);
+}
+
+/* Scrub the free inode btree (XFS_BTNUM_FINO) of the AG set up in @sc. */
+int
+xfs_scrub_finobt(
+	struct xfs_scrub_context	*sc)
+{
+	const xfs_btnum_t		which = XFS_BTNUM_FINO;
+
+	return xfs_scrub_iallocbt(sc, which);
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
new file mode 100644
index 000000000000..637b7a892313
--- /dev/null
+++ b/fs/xfs/scrub/inode.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Grab total control of the inode metadata.  It doesn't matter here if
+ * the file data is still changing; exclusive access to the metadata is
+ * the goal.
+ *
+ * If the inode can be obtained it is locked (IOLOCK, MMAPLOCK and, once
+ * the scrub transaction exists, ILOCK, all exclusive) and the locks held
+ * are recorded in sc->ilock_flags.  If obtaining the inode fails with
+ * -EFSCORRUPTED or -EFSBADCRC, no inode is attached and the scrubber
+ * falls back to checking the raw ondisk inode; a scrub transaction is
+ * allocated in that case too, because the raw path hands sc->tp to the
+ * buffer read and release helpers.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+xfs_scrub_setup_inode(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	struct xfs_mount		*mp = sc->mp;
+	int				error;
+
+	/*
+	 * Try to get the inode.  If the verifiers fail, we try again
+	 * in raw mode.
+	 */
+	error = xfs_scrub_get_inode(sc, ip);
+	switch (error) {
+	case 0:
+		break;
+	case -EFSCORRUPTED:
+	case -EFSBADCRC:
+		/* Raw mode still reads buffers, so set up the transaction. */
+		return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	default:
+		return error;
+	}
+
+	/* Got the inode, lock it and we're ready to go. */
+	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	if (error)
+		goto out;
+
+	/* The ILOCK is only taken after the transaction is allocated. */
+	sc->ilock_flags |= XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+
+out:
+	/* scrub teardown will unlock and release the inode for us */
+	return error;
+}
+
+/* Inode core */
+
+/*
+ * Validate di_extsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_extsize().
+ * These functions must be kept in sync with each other.
+ *
+ * The inode is flagged corrupt (once) if any rule is broken.
+ */
+STATIC void
+xfs_scrub_inode_extsize(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp,
+	struct xfs_dinode		*dip,
+	xfs_ino_t			ino,
+	uint16_t			mode,
+	uint16_t			flags)
+{
+	struct xfs_mount		*mp = sc->mp;
+	const bool			rt_flag = (flags & XFS_DIFLAG_REALTIME);
+	const bool			hint_flag = (flags & XFS_DIFLAG_EXTSIZE);
+	const bool			inherit_flag =
+					(flags & XFS_DIFLAG_EXTSZINHERIT);
+	const bool			any_hint = hint_flag || inherit_flag;
+	const uint32_t			extsize = be32_to_cpu(dip->di_extsize);
+	const uint32_t			extsize_bytes =
+					XFS_FSB_TO_B(sc->mp, extsize);
+	uint32_t			blocksize_bytes;
+	bool				bad;
+
+	/* The hint is measured in rt extents on realtime files. */
+	blocksize_bytes = rt_flag ?
+			mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog :
+			mp->m_sb.sb_blocksize;
+
+	/* Tests run in this order; the first one that trips wins. */
+	if (any_hint && !(S_ISDIR(mode) || S_ISREG(mode)))
+		bad = true;	/* hints belong on files and dirs only */
+	else if (hint_flag && !S_ISREG(mode))
+		bad = true;	/* EXTSIZE is for regular files */
+	else if (inherit_flag && !S_ISDIR(mode))
+		bad = true;	/* EXTSZINHERIT is for directories */
+	else if (any_hint && extsize == 0)
+		bad = true;	/* flag set but no hint value */
+	else if (!any_hint && extsize != 0)
+		bad = true;	/* hint value but no flag */
+	else if (extsize_bytes % blocksize_bytes)
+		bad = true;	/* not a multiple of the allocation unit */
+	else if (extsize > MAXEXTLEN)
+		bad = true;
+	else if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
+		bad = true;	/* more than half an AG */
+	else
+		bad = false;
+
+	if (bad)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/*
+ * Validate di_cowextsize hint.
+ *
+ * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
+ * These functions must be kept in sync with each other.
+ *
+ * The inode is flagged corrupt (once) if any rule is broken.
+ */
+STATIC void
+xfs_scrub_inode_cowextsize(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp,
+	struct xfs_dinode		*dip,
+	xfs_ino_t			ino,
+	uint16_t			mode,
+	uint16_t			flags,
+	uint64_t			flags2)
+{
+	struct xfs_mount		*mp = sc->mp;
+	const bool			rt_flag = (flags & XFS_DIFLAG_REALTIME);
+	const bool			hint_flag =
+					(flags2 & XFS_DIFLAG2_COWEXTSIZE);
+	const uint32_t			extsize =
+					be32_to_cpu(dip->di_cowextsize);
+	const uint32_t			extsize_bytes =
+					XFS_FSB_TO_B(sc->mp, extsize);
+	bool				bad = false;
+
+	if (hint_flag) {
+		/*
+		 * A CoW hint needs the reflink feature, a file or
+		 * directory, a nonzero value, and no realtime flag.
+		 */
+		if (!xfs_sb_version_hasreflink(&mp->m_sb) ||
+		    !(S_ISDIR(mode) || S_ISREG(mode)) ||
+		    extsize == 0 ||
+		    rt_flag)
+			bad = true;
+	} else if (extsize != 0) {
+		/* A hint value without the flag makes no sense. */
+		bad = true;
+	}
+
+	/* Size limits are only looked at if the flag rules held up. */
+	if (!bad &&
+	    ((extsize_bytes % mp->m_sb.sb_blocksize) ||
+	     extsize > MAXEXTLEN ||
+	     extsize > mp->m_sb.sb_agblocks / 2))
+		bad = true;
+
+	if (bad)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/*
+ * Make sure the di_flags make sense for the inode.
+ *
+ * @flags is the ondisk di_flags value in CPU byte order, so every test
+ * here uses the XFS_DIFLAG_* constants.  @mode is the ondisk di_mode.
+ * The inode is flagged corrupt if unknown bits are set or if a flag is
+ * found on an inode it cannot apply to.
+ */
+STATIC void
+xfs_scrub_inode_flags(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp,
+	struct xfs_dinode		*dip,
+	xfs_ino_t			ino,
+	uint16_t			mode,
+	uint16_t			flags)
+{
+	struct xfs_mount		*mp = sc->mp;
+
+	/* no bits outside the known flag set */
+	if (flags & ~XFS_DIFLAG_ANY)
+		goto bad;
+
+	/* rt flags require rt device */
+	if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) &&
+	    !mp->m_rtdev_targp)
+		goto bad;
+
+	/* new rt bitmap flag only valid for rbmino */
+	if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino)
+		goto bad;
+
+	/* directory-only flags */
+	if ((flags & (XFS_DIFLAG_RTINHERIT |
+		      XFS_DIFLAG_EXTSZINHERIT |
+		      XFS_DIFLAG_PROJINHERIT |
+		      XFS_DIFLAG_NOSYMLINKS)) &&
+	    !S_ISDIR(mode))
+		goto bad;
+
+	/*
+	 * file-only flags; these are ondisk flags, so test the ondisk
+	 * XFS_DIFLAG_EXTSIZE bit rather than the ioctl-level FS_XFLAG one
+	 */
+	if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE)) &&
+	    !S_ISREG(mode))
+		goto bad;
+
+	/* filestreams and rt make no sense */
+	if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME))
+		goto bad;
+
+	return;
+bad:
+	xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/*
+ * Make sure the di_flags2 make sense for the inode.
+ *
+ * @flags and @flags2 are the ondisk di_flags and di_flags2 values in CPU
+ * byte order.  The inode is flagged corrupt (once) if unknown bits are
+ * set or if the reflink or DAX flags are used where they cannot apply.
+ * The cowextsize flag is checked w.r.t. mode separately.
+ */
+STATIC void
+xfs_scrub_inode_flags2(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*bp,
+	struct xfs_dinode		*dip,
+	xfs_ino_t			ino,
+	uint16_t			mode,
+	uint16_t			flags,
+	uint64_t			flags2)
+{
+	struct xfs_mount		*mp = sc->mp;
+	const bool			is_reflink =
+					(flags2 & XFS_DIFLAG2_REFLINK);
+	const bool			is_dax = (flags2 & XFS_DIFLAG2_DAX);
+	bool				bad = false;
+
+	/* no bits outside the known flag set */
+	if (flags2 & ~XFS_DIFLAG2_ANY)
+		bad = true;
+
+	/*
+	 * reflink needs the reflink feature and a regular file, and
+	 * currently cannot be combined with realtime or DAX
+	 */
+	if (is_reflink &&
+	    (!xfs_sb_version_hasreflink(&mp->m_sb) ||
+	     !S_ISREG(mode) ||
+	     (flags & XFS_DIFLAG_REALTIME) ||
+	     is_dax))
+		bad = true;
+
+	/* DAX is for files and directories only */
+	if (is_dax && !(S_ISREG(mode) || S_ISDIR(mode)))
+		bad = true;
+
+	if (bad)
+		xfs_scrub_ino_set_corrupt(sc, ino, bp);
+}
+
+/*
+ * Scrub all the ondisk inode fields.
+ *
+ * @dip is the inode core to check, @ino its inode number and @bp the
+ * buffer that is passed along to the xfs_scrub_ino_set_*() reporting
+ * helpers.  Problems are recorded through those helpers; nothing is
+ * returned.  An unrecognized di_version ends the checks early.
+ */
+STATIC void
+xfs_scrub_dinode(
+ struct xfs_scrub_context *sc,
+ struct xfs_buf *bp,
+ struct xfs_dinode *dip,
+ xfs_ino_t ino)
+{
+ struct xfs_mount *mp = sc->mp;
+ size_t fork_recs;
+ unsigned long long isize;
+ uint64_t flags2;
+ uint32_t nextents;
+ uint16_t flags;
+ uint16_t mode;
+
+ flags = be16_to_cpu(dip->di_flags);
+ if (dip->di_version >= 3)
+ flags2 = be64_to_cpu(dip->di_flags2);
+ else
+ flags2 = 0; /* di_flags2 is only read from v3 and newer inodes */
+
+ /* di_mode: only permission and file type bits may be set */
+ mode = be16_to_cpu(dip->di_mode);
+ if (mode & ~(S_IALLUGO | S_IFMT))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* v1/v2 fields */
+ switch (dip->di_version) {
+ case 1:
+ /*
+ * We autoconvert v1 inodes into v2 inodes on writeout,
+ * so just mark this inode for preening.
+ */
+ xfs_scrub_ino_set_preen(sc, ino, bp);
+ break;
+ case 2:
+ case 3:
+ if (dip->di_onlink != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ if (dip->di_mode == 0 && sc->ip) /* an in-core inode must have a mode */
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ if (dip->di_projid_hi != 0 &&
+ !xfs_sb_version_hasprojid32bit(&mp->m_sb))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ return;
+ }
+
+ /*
+ * di_uid/di_gid -- -1 isn't invalid, but there's no way that
+ * userspace could have created that.
+ */
+ if (dip->di_uid == cpu_to_be32(-1U) ||
+ dip->di_gid == cpu_to_be32(-1U))
+ xfs_scrub_ino_set_warning(sc, ino, bp);
+
+ /* di_format: the data fork format has to suit the file type */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) &&
+ !S_ISFIFO(mode) && !S_ISSOCK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ if (!S_ISDIR(mode) && !S_ISLNK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (!S_ISREG(mode) && !S_ISDIR(mode))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ default:
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ }
+
+ /*
+ * di_size. xfs_dinode_verify checks for things that screw up
+ * the VFS such as the upper bit being set and zero-length
+ * symlinks/directories, but we can do more here.
+ */
+ isize = be64_to_cpu(dip->di_size);
+ if (isize & (1ULL << 63))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* Devices, fifos, and sockets must have zero size */
+ if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* Directories can't be larger than the data section size (32G) */
+ if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* Symlinks can't be empty, nor XFS_SYMLINK_MAXLEN bytes or longer */
+ if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN))
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /*
+ * Warn if the running kernel can't handle the kinds of offsets
+ * needed to deal with the file size. In other words, if the
+ * pagecache can't cache all the blocks in this file due to
+ * overly large offsets, flag the inode for admin review.
+ */
+ if (isize >= mp->m_super->s_maxbytes)
+ xfs_scrub_ino_set_warning(sc, ino, bp);
+
+ /* di_nblocks */
+ if (flags2 & XFS_DIFLAG2_REFLINK) {
+ ; /* nblocks can exceed dblocks */
+ } else if (flags & XFS_DIFLAG_REALTIME) {
+ /*
+ * nblocks is the sum of data extents (in the rtdev),
+ * attr extents (in the datadev), and both forks' bmbt
+ * blocks (in the datadev). This clumsy check is the
+ * best we can do without cross-referencing with the
+ * inode forks.
+ */
+ if (be64_to_cpu(dip->di_nblocks) >=
+ mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ } else {
+ if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ }
+
+ xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags);
+
+ xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags);
+
+ /*
+ * di_nextents: an extents-format fork must fit its records in the
+ * fork area, a btree-format fork must have too many to fit, and any
+ * other format must have none.
+ */
+ nextents = be32_to_cpu(dip->di_nextents);
+ fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ default:
+ if (nextents != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ }
+
+ /*
+ * di_forkoff: the attr fork has to start inside the inode, and an
+ * inode without an attr fork must have no attr extents and the
+ * extents attr format.
+ */
+ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* di_aformat: only local, extents and btree are accepted */
+ if (dip->di_aformat != XFS_DINODE_FMT_LOCAL &&
+ dip->di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ dip->di_aformat != XFS_DINODE_FMT_BTREE)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+
+ /* di_anextents: same rules as di_nextents, applied to the attr fork */
+ nextents = be16_to_cpu(dip->di_anextents);
+ fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_EXTENTS:
+ if (nextents > fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (nextents <= fork_recs)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ break;
+ default:
+ if (nextents != 0)
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ }
+
+ if (dip->di_version >= 3) { /* di_flags2 and di_cowextsize are v3+ fields */
+ xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2);
+ xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags,
+ flags2);
+ }
+}
+
+/*
+ * Map and read a raw inode.
+ *
+ * When a usable inode is found, *@bpp and *@dipp are set to the inode
+ * cluster buffer and to the inode within it, and the caller owns the
+ * buffer.  They are left untouched, and any buffer read here is released,
+ * when the inode fails verification (the scrub is flagged corrupt) or when
+ * its generation does not match the request (-ENOENT).  Callers must
+ * therefore check the buffer pointer as well as the return value.
+ */
+STATIC int
+xfs_scrub_inode_map_raw(
+ struct xfs_scrub_context *sc,
+ xfs_ino_t ino,
+ struct xfs_buf **bpp,
+ struct xfs_dinode **dipp)
+{
+ struct xfs_imap imap;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp = NULL;
+ struct xfs_dinode *dip;
+ int error;
+
+ error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED);
+ if (error == -EINVAL) {
+ /*
+ * Inode could have gotten deleted out from under us;
+ * just forget about it.
+ */
+ error = -ENOENT;
+ goto out;
+ }
+ if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ goto out;
+
+ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+ imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp,
+ NULL); /* NULL ops: read without a buffer verifier */
+ if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+ XFS_INO_TO_AGBNO(mp, ino), &error))
+ goto out;
+
+ /*
+ * Is this really an inode? We disabled verifiers in the above
+ * xfs_trans_read_buf call because the inode buffer verifier
+ * fails on /any/ inode record in the inode cluster with a bad
+ * magic or version number, not just the one that we're
+ * checking. Therefore, grab the buffer unconditionally, attach
+ * the inode verifiers by hand, and run the inode verifier only
+ * on the one inode we want.
+ */
+ bp->b_ops = &xfs_inode_buf_ops;
+ dip = xfs_buf_offset(bp, imap.im_boffset);
+ if (!xfs_dinode_verify(mp, ino, dip) ||
+ !xfs_dinode_good_version(mp, dip->di_version)) {
+ xfs_scrub_ino_set_corrupt(sc, ino, bp);
+ goto out_buf;
+ }
+
+ /* ...and is it the one we asked for? */
+ if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) {
+ error = -ENOENT;
+ goto out_buf;
+ }
+
+ *dipp = dip;
+ *bpp = bp;
+out:
+ return error;
+out_buf:
+ xfs_trans_brelse(sc->tp, bp); /* not handing the buffer to the caller */
+ return error;
+}
+
+/*
+ * Scrub an inode.
+ *
+ * With an in-core inode (sc->ip) the core is converted to its ondisk form
+ * and checked from there; without one the raw inode is mapped and read
+ * from disk.  Checks that need a live inode are skipped in raw mode and
+ * when the core has already been flagged corrupt.
+ */
+int
+xfs_scrub_inode(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_dinode		di;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp = NULL;
+	struct xfs_dinode		*dip;
+	xfs_ino_t			ino;
+	bool				has_shared;
+	int				error = 0;
+
+	/* Did we get the in-core inode, or are we doing this manually? */
+	if (!sc->ip) {
+		/* Raw mode: map & read the inode straight from disk. */
+		ino = sc->sm->sm_ino;
+		error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip);
+		if (error || !bp)
+			goto out;
+	} else {
+		ino = sc->ip->i_ino;
+		xfs_inode_to_disk(sc->ip, &di, 0);
+		dip = &di;
+	}
+
+	xfs_scrub_dinode(sc, bp, dip, ino);
+
+	/* Everything below requires a live, not yet corrupt, inode. */
+	if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || !sc->ip)
+		goto out;
+
+	/*
+	 * Does this inode have the reflink flag set but no shared extents?
+	 * Set the preening flag if this is the case.
+	 */
+	if (!xfs_is_reflink_inode(sc->ip))
+		goto out;
+
+	error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+			&has_shared);
+	if (xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino),
+			XFS_INO_TO_AGBNO(mp, ino), &error) &&
+	    !has_shared)
+		xfs_scrub_ino_set_preen(sc, ino, bp);
+
+out:
+	if (bp)
+		xfs_trans_brelse(sc->tp, bp);
+	return error;
+}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
new file mode 100644
index 000000000000..63a25334fc83
--- /dev/null
+++ b/fs/xfs/scrub/parent.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ialloc.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub parents.  This is the common inode-contents setup,
+ * called with zero as its third argument.
+ */
+int
+xfs_scrub_setup_parent(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	int				error;
+
+	error = xfs_scrub_setup_inode_contents(sc, ip, 0);
+	return error;
+}
+
+/* Parent pointers */
+
+/*
+ * Look for an entry in a parent pointing to this inode.
+ *
+ * This is the readdir context handed to xfs_readdir(); the actor recovers
+ * it from the embedded dir_context with container_of().
+ */
+
+struct xfs_scrub_parent_ctx {
+ struct dir_context dc; /* VFS readdir context (actor and position) */
+ xfs_ino_t ino; /* inode number we are looking for */
+ xfs_nlink_t nlink; /* number of matching entries seen so far */
+};
+
+/*
+ * Readdir actor: count this entry if it points to the inode we are
+ * looking for.  Always returns 0; the name, position and type of the
+ * entry are not looked at.
+ */
+STATIC int
+xfs_scrub_parent_actor(
+	struct dir_context		*dc,
+	const char			*name,
+	int				namelen,
+	loff_t				pos,
+	u64				ino,
+	unsigned			type)
+{
+	struct xfs_scrub_parent_ctx	*ctx =
+		container_of(dc, struct xfs_scrub_parent_ctx, dc);
+
+	if (ino != ctx->ino)
+		return 0;
+
+	ctx->nlink++;
+	return 0;
+}
+
+/*
+ * Count the number of dentries in the parent dir that point to this inode.
+ *
+ * On success 0 is returned and the count is stored in *@nlink.  On error
+ * a negative errno is returned and *@nlink is left untouched.
+ */
+STATIC int
+xfs_scrub_parent_count_parent_dentries(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*parent,
+	xfs_nlink_t			*nlink)
+{
+	struct xfs_scrub_parent_ctx	spc = {
+		.dc.actor		= xfs_scrub_parent_actor,
+		.dc.pos			= 0,
+		.ino			= sc->ip->i_ino,
+		.nlink			= 0,
+	};
+	size_t				bufsize;
+	loff_t				lastpos;
+	uint				lock_mode;
+	int				error = 0;
+
+	/*
+	 * If there are any blocks, read-ahead block 0 as we're almost
+	 * certain to have the next operation be a read there.  This is
+	 * how we guarantee that the parent's extent map has been loaded,
+	 * if there is one.
+	 */
+	lock_mode = xfs_ilock_data_map_shared(parent);
+	if (parent->i_d.di_nextents > 0)
+		error = xfs_dir3_data_readahead(parent, 0, -1);
+	xfs_iunlock(parent, lock_mode);
+	if (error)
+		return error;
+
+	/*
+	 * Iterate the parent dir, counting the entries that point back
+	 * to the inode being scanned.  We are done once a readdir call
+	 * no longer advances the position.
+	 */
+	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
+			parent->i_d.di_size);
+	for (lastpos = 0; ; lastpos = spc.dc.pos) {
+		error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
+		if (error)
+			return error;
+		if (lastpos == spc.dc.pos)
+			break;
+	}
+
+	*nlink = spc.nlink;
+	return 0;
+}
+
+/*
+ * Given the inode number of the alleged parent of the inode being
+ * scrubbed, try to validate that the parent has exactly one directory
+ * entry pointing back to the inode being scrubbed.
+ *
+ * @dnum is the inode number that '..' points to.  *@try_again is set to
+ * true if '..' was found to have changed while the child was unlocked, in
+ * which case the caller should look it up again and retry.  Problems with
+ * the parent pointer are reported by flagging the data fork corrupt; the
+ * return value is 0 or a negative errno.
+ */
+STATIC int
+xfs_scrub_parent_validate(
+	struct xfs_scrub_context	*sc,
+	xfs_ino_t			dnum,
+	bool				*try_again)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_inode		*dp = NULL;
+	xfs_nlink_t			expected_nlink;
+	xfs_nlink_t			nlink;
+	int				error = 0;
+
+	*try_again = false;
+
+	/* '..' must not point to ourselves. */
+	if (sc->ip->i_ino == dnum) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		goto out;
+	}
+
+	/*
+	 * If we're an unlinked directory, the parent /won't/ have a link
+	 * to us.  Otherwise, it should have one link.
+	 */
+	expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+	/*
+	 * Grab this parent inode.  We release the inode before we
+	 * cancel the scrub transaction.  Since we don't know a
+	 * priori that releasing the inode won't trigger eofblocks
+	 * cleanup (which allocates what would be a nested transaction)
+	 * if the parent pointer erroneously points to a file, we
+	 * can't use DONTCACHE here because DONTCACHE inodes can trigger
+	 * immediate inactive cleanup of the inode.
+	 */
+	error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+		goto out;
+
+	/*
+	 * '..' has to resolve to a directory other than ourselves.  Bail
+	 * out before handing a non-directory to the readdir code below.
+	 */
+	if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		goto out_rele;
+	}
+
+	/*
+	 * We prefer to keep the inode locked while we lock and search
+	 * its alleged parent for a forward reference.  If we can grab
+	 * the iolock, validate the pointers and we're done.  We must
+	 * use nowait here to avoid an ABBA deadlock on the parent and
+	 * the child inodes.
+	 */
+	if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) {
+		error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
+		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+				&error))
+			goto out_unlock;
+		if (nlink != expected_nlink)
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		goto out_unlock;
+	}
+
+	/*
+	 * The game changes if we get here.  We failed to lock the parent,
+	 * so we're going to try to verify both pointers while only holding
+	 * one lock so as to avoid deadlocking with something that's actually
+	 * trying to traverse down the directory tree.
+	 */
+	xfs_iunlock(sc->ip, sc->ilock_flags);
+	sc->ilock_flags = 0;
+	xfs_ilock(dp, XFS_IOLOCK_SHARED);
+
+	/* Go looking for our dentry. */
+	error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+		goto out_unlock;
+
+	/* Drop the parent lock, relock this inode. */
+	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+	sc->ilock_flags = XFS_IOLOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+
+	/*
+	 * If we're an unlinked directory, the parent /won't/ have a link
+	 * to us.  Otherwise, it should have one link.  We have to re-set
+	 * it here because we dropped the lock on sc->ip.
+	 */
+	expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
+
+	/* Look up '..' to see if the inode changed. */
+	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+		goto out_rele;
+
+	/* Drat, parent changed.  Try again! */
+	if (dnum != dp->i_ino) {
+		iput(VFS_I(dp));
+		*try_again = true;
+		return 0;
+	}
+	iput(VFS_I(dp));
+
+	/*
+	 * '..' didn't change, so check that there was only one entry
+	 * for us in the parent.
+	 */
+	if (nlink != expected_nlink)
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+	return error;
+
+out_unlock:
+	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+out_rele:
+	iput(VFS_I(dp));
+out:
+	return error;
+}
+
+/*
+ * Scrub a parent pointer.
+ *
+ * Only directories are checked; anything else gets -ENOENT.  The '..'
+ * entry is looked up and validated against the alleged parent, retrying
+ * up to 20 times if '..' keeps changing underneath us, after which the
+ * scrub is marked incomplete.
+ */
+int
+xfs_scrub_parent(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	xfs_ino_t			dnum;
+	bool				try_again;
+	int				tries = 0;
+	int				error = 0;
+
+	/*
+	 * If we're a directory, check that the '..' link points up to
+	 * a directory that has one entry pointing to us.
+	 */
+	if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
+		return -ENOENT;
+
+	/* We're not a special inode, are we? */
+	if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return error;
+	}
+
+	/*
+	 * The VFS grabs a read or write lock via i_rwsem before it reads
+	 * or writes to a directory.  If we've gotten this far we've
+	 * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
+	 * getting a write lock on i_rwsem.  Therefore, it is safe for us
+	 * to drop the ILOCK here in order to do directory lookups.
+	 */
+	sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
+	/* Look up '..' */
+	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+		return error;
+	if (!xfs_verify_dir_ino(mp, dnum)) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return error;
+	}
+
+	/* Is this the root dir?  Then '..' must point to itself. */
+	if (sc->ip == mp->m_rootip) {
+		if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
+		    sc->ip->i_ino != dnum)
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return error;
+	}
+
+	for (;;) {
+		error = xfs_scrub_parent_validate(sc, dnum, &try_again);
+		if (error)
+			return error;
+		if (!try_again)
+			break;
+		if (++tries >= 20) {
+			/*
+			 * We gave it our best shot but failed, so mark this
+			 * scrub incomplete.  Userspace can decide if it
+			 * wants to try again.
+			 */
+			xfs_scrub_set_incomplete(sc);
+			break;
+		}
+	}
+
+	return error;
+}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
new file mode 100644
index 000000000000..8e58ba842946
--- /dev/null
+++ b/fs/xfs/scrub/quota.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Map the scrub type in sc->sm to its XFS_DQ_* flag; 0 means no match. */
+static inline uint
+xfs_scrub_quota_to_dqtype(
+ struct xfs_scrub_context *sc)
+{
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_UQUOTA)
+ return XFS_DQ_USER;
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_GQUOTA)
+ return XFS_DQ_GROUP;
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_PQUOTA)
+ return XFS_DQ_PROJ;
+ /*
+ * Not one of the three quota scrub types; callers treat 0 as an error.
+ */
+ return 0;
+}
+
+/* Validate a quota scrub request before the scrubber itself runs. */
+int
+xfs_scrub_setup_quota(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub_metadata *sm = sc->sm;
+ uint type = xfs_scrub_quota_to_dqtype(sc);
+
+ /*
+ * Quota scrubs take no AG number, inode number or generation, so
+ * a request that sets any of them is rejected.
+ */
+ if (sm->sm_agno != 0 || sm->sm_ino != 0 || sm->sm_gen != 0)
+ return -EINVAL;
+
+ /* Zero means the scrub type did not map to a dquot type. */
+ if (!type)
+ return -EINVAL;
+ /* Report -ENOENT unless this quota type is switched on. */
+ return xfs_this_quota_on(sc->mp, type) ? 0 : -ENOENT;
+}
+
+/* Quotas. */
+
+/*
+ * Scrub the fields in an individual quota item. @id is the id that was
+ * asked for and @dq is the dquot that the lookup returned. Findings are
+ * reported against the quota file offset that holds @id.
+ */
+STATIC void
+xfs_scrub_quota_item(
+ struct xfs_scrub_context *sc,
+ uint dqtype,
+ struct xfs_dquot *dq,
+ xfs_dqid_t id)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_disk_dquot *d = &dq->q_core;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset;
+ unsigned long long bsoft, isoft, rsoft;
+ unsigned long long bhard, ihard, rhard;
+ unsigned long long bcount, icount, rcount;
+ xfs_ino_t fs_icount;
+
+ /* Ids are packed qi_dqperchunk to a chunk, so divide to get the offset. */
+ offset = id / qi->qi_dqperchunk;
+
+ /*
+ * We fed $id and DQNEXT into the xfs_qm_dqget call, which means
+ * that the actual dquot we got must either have the same id or
+ * the next higher id.
+ */
+ if (id > be32_to_cpu(d->d_id))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Did we get the dquot type we wanted? */
+ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Both padding fields must be zero. */
+ if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the limits. */
+ bhard = be64_to_cpu(d->d_blk_hardlimit);
+ ihard = be64_to_cpu(d->d_ino_hardlimit);
+ rhard = be64_to_cpu(d->d_rtb_hardlimit);
+
+ bsoft = be64_to_cpu(d->d_blk_softlimit);
+ isoft = be64_to_cpu(d->d_ino_softlimit);
+ rsoft = be64_to_cpu(d->d_rtb_softlimit);
+
+ /*
+ * Warn if the hard limits are larger than the fs.
+ * Administrators can do this, though in production this seems
+ * suspect, which is why we flag it for review.
+ *
+ * Complain about corruption if the soft limit is greater than
+ * the hard limit.
+ */
+ if (bhard > mp->m_sb.sb_dblocks)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (bsoft > bhard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (ihard > mp->m_maxicount)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (isoft > ihard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ if (rhard > mp->m_sb.sb_rblocks)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (rsoft > rhard)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /* Check the resource counts. */
+ bcount = be64_to_cpu(d->d_bcount);
+ icount = be64_to_cpu(d->d_icount);
+ rcount = be64_to_cpu(d->d_rtbcount);
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits.
+ */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_dblocks < bcount)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK,
+ offset);
+ } else {
+ if (mp->m_sb.sb_dblocks < bcount)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ offset);
+ }
+ if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /*
+ * We can violate the hard limits if the admin suddenly sets a
+ * lower limit than the actual usage. However, we flag it for
+ * admin review.
+ */
+ if (id != 0 && bhard != 0 && bcount > bhard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (id != 0 && ihard != 0 && icount > ihard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (id != 0 && rhard != 0 && rcount > rhard)
+ xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+}
+
+/* Scrub one quota type: check the quota file's extents, then every dquot. */
+int
+xfs_scrub_quota(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *dq;
+ xfs_fileoff_t max_dqid_off;
+ xfs_fileoff_t off = 0;
+ xfs_dqid_t id = 0;
+ uint dqtype;
+ int nimaps;
+ int error = 0; /* the realtime-flag exit below never assigns it */
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return -ENOENT;
+
+ mutex_lock(&qi->qi_quotaofflock);
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
+ if (!xfs_this_quota_on(sc->mp, dqtype)) {
+ error = -ENOENT;
+ goto out_unlock_quota;
+ }
+
+ /* Attach to the quota inode and set sc->ip so that reporting works. */
+ ip = xfs_quota_inode(sc->mp, dqtype);
+ sc->ip = ip;
+
+ /* Look for problem extents. */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL);
+ goto out_unlock_inode;
+ }
+ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ while (1) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+
+ off = irec.br_startoff + irec.br_blockcount;
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps,
+ XFS_BMAPI_ENTIRE);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
+ &error))
+ goto out_unlock_inode;
+ if (!nimaps)
+ break;
+ if (irec.br_startblock == HOLESTARTBLOCK)
+ continue;
+
+ /* Check the extent record doesn't point to crap. */
+ if (irec.br_startblock + irec.br_blockcount <=
+ irec.br_startblock)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+ if (!xfs_verify_fsbno(mp, irec.br_startblock) ||
+ !xfs_verify_fsbno(mp, irec.br_startblock +
+ irec.br_blockcount - 1))
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+
+ /*
+ * Unwritten extents or blocks mapped above the highest
+ * quota id shouldn't happen.
+ */
+ if (isnullstartblock(irec.br_startblock) ||
+ irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /* Check all the quota items; errors map to offset id / qi_dqperchunk. */
+ while (id < ((xfs_dqid_t)-1ULL)) {
+ if (xfs_scrub_should_terminate(sc, &error))
+ break;
+
+ error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT,
+ &dq);
+ if (error == -ENOENT)
+ break;
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
+ id / qi->qi_dqperchunk, &error))
+ break;
+
+ xfs_scrub_quota_item(sc, dqtype, dq, id);
+
+ id = be32_to_cpu(dq->q_core.d_id) + 1;
+ xfs_qm_dqput(dq);
+ if (!id)
+ break;
+ }
+
+out:
+ /* We set sc->ip earlier, so make sure we clear it now. */
+ sc->ip = NULL;
+out_unlock_quota:
+ mutex_unlock(&qi->qi_quotaofflock);
+ return error;
+
+out_unlock_inode:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
+}
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
new file mode 100644
index 000000000000..2f88a8d44bd0
--- /dev/null
+++ b/fs/xfs/scrub/refcount.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub reference count btrees (common AG btree setup).
+ */
+int
+xfs_scrub_setup_ag_refcountbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Reference count btree scrubber. */
+
+/* Check a single refcount btree record for impossible field values. */
+STATIC int
+xfs_scrub_refcountbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agblock_t rawbno = be32_to_cpu(rec->refc.rc_startblock);
+ xfs_extlen_t blocks = be32_to_cpu(rec->refc.rc_blockcount);
+ xfs_nlink_t nr_refs = be32_to_cpu(rec->refc.rc_refcount);
+ xfs_agblock_t agbno;
+ bool cow_staging = (rawbno & XFS_REFC_COW_START);
+
+ /* A refcount of exactly one goes hand in hand with the CoW flag. */
+ if ((nr_refs == 1) != cow_staging)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /*
+ * With the flag masked off, the extent must not wrap around and
+ * both of its end blocks must pass the AG block number check.
+ */
+ agbno = rawbno & ~XFS_REFC_COW_START;
+ if (agbno + blocks <= agbno ||
+ !xfs_verify_agbno(mp, agno, agbno) ||
+ !xfs_verify_agbno(mp, agno, agbno + blocks - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /* No record may claim zero references. */
+ if (nr_refs == 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ /* Findings are flagged above; this callback itself always succeeds. */
+ return 0;
+}
+
+/* Scrub the refcount btree for some AG, passing an XFS_RMAP_OWN_REFC owner. */
+int
+xfs_scrub_refcountbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+ return xfs_scrub_btree(sc, sc->sa.refc_cur, xfs_scrub_refcountbt_rec,
+ &oinfo, NULL);
+}
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
new file mode 100644
index 000000000000..97846c424690
--- /dev/null
+++ b/fs/xfs/scrub/rmap.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/*
+ * Set us up to scrub reverse mapping btrees (common AG btree setup).
+ */
+int
+xfs_scrub_setup_ag_rmapbt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ return xfs_scrub_setup_ag_btree(sc, ip, false);
+}
+
+/* Reverse-mapping scrubber. */
+
+/* Scrub an rmapbt record: decode it, then check extent, flags and owner. */
+STATIC int
+xfs_scrub_rmapbt_rec(
+ struct xfs_scrub_btree *bs,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_rmap_irec irec;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ bool non_inode;
+ bool is_unwritten;
+ bool is_bmbt;
+ bool is_attr;
+ int error;
+
+ error = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error))
+ goto out;
+
+ /* Check extent: start plus length must not wrap around. */
+ if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (irec.rm_owner == XFS_RMAP_OWN_FS) {
+ /*
+ * xfs_verify_agbno returns false for static fs metadata.
+ * Since that only exists at the start of the AG, validate
+ * that by hand.
+ */
+ if (irec.rm_startblock != 0 ||
+ irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /*
+ * Otherwise we must point somewhere past the static metadata
+ * but before the end of the FS. Run the regular check.
+ */
+ if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
+ !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+ irec.rm_blockcount - 1))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+
+ /* Check flags: decode them, then reject inconsistent combinations. */
+ non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner);
+ is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN;
+
+ if (is_bmbt && irec.rm_offset != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && irec.rm_offset != 0)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (is_unwritten && (is_bmbt || non_inode || is_attr))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (non_inode && (is_bmbt || is_unwritten || is_attr))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ if (!non_inode) {
+ if (!xfs_verify_ino(mp, irec.rm_owner))
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ } else {
+ /* Non-inode owner within the magic values? */
+ if (irec.rm_owner <= XFS_RMAP_OWN_MIN ||
+ irec.rm_owner > XFS_RMAP_OWN_FS)
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
+ }
+out:
+ return error;
+}
+
+/* Scrub the rmap btree for some AG, passing an XFS_RMAP_OWN_AG owner. */
+int
+xfs_scrub_rmapbt(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_owner_info oinfo;
+
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ return xfs_scrub_btree(sc, sc->sa.rmap_cur, xfs_scrub_rmapbt_rec,
+ &oinfo, NULL);
+}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
new file mode 100644
index 000000000000..c6fedb698008
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Set us up with the realtime bitmap inode (mp->m_rbmip) locked in sc->ip. */
+int
+xfs_scrub_setup_rt(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /*
+ * If userspace gave us an AG number or inode data, they don't
+ * know what they're doing. Get out.
+ */
+ if (sc->sm->sm_agno || sc->sm->sm_ino || sc->sm->sm_gen)
+ return -EINVAL;
+
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
+ sc->ip = mp->m_rbmip;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ return 0;
+}
+
+/* Realtime bitmap. */
+
+/* Flag a free rt extent that wraps around or has a bad first or last block. */
+STATIC int
+xfs_scrub_rtbitmap_rec(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub_context *sc = priv;
+
+ if (rec->ar_startblock + rec->ar_blockcount > rec->ar_startblock &&
+ xfs_verify_rtbno(sc->mp, rec->ar_startblock) &&
+ xfs_verify_rtbno(sc->mp, rec->ar_startblock + rec->ar_blockcount - 1))
+ return 0;
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+}
+
+/* Scrub the realtime bitmap by checking each free extent it reports. */
+int
+xfs_scrub_rtbitmap(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ /* Hand every free extent record to xfs_scrub_rtbitmap_rec(). */
+ error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc);
+
+ /* The processed error code is returned whether or not it is zero. */
+ xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
+ return error;
+}
+
+/* Scrub the realtime summary: not implemented yet, so always -ENOENT. */
+int
+xfs_scrub_rtsummary(
+ struct xfs_scrub_context *sc)
+{
+ /* XXX: implement this some day */
+ return -ENOENT;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
new file mode 100644
index 000000000000..9c42c4efd01e
--- /dev/null
+++ b/fs/xfs/scrub/scrub.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/scrub.h"
+#include "scrub/btree.h"
+
+/*
+ * Online Scrub and Repair
+ *
+ * Traditionally, XFS (the kernel driver) did not know how to check or
+ * repair on-disk data structures. That task was left to the xfs_check
+ * and xfs_repair tools, both of which require taking the filesystem
+ * offline for a thorough but time consuming examination. Online
+ * scrub & repair, on the other hand, enables us to check the metadata
+ * for obvious errors while carefully stepping around the filesystem's
+ * ongoing operations, locking rules, etc.
+ *
+ * Given that most XFS metadata consist of records stored in a btree,
+ * most of the checking functions iterate the btree blocks themselves
+ * looking for irregularities. When a record block is encountered, each
+ * record can be checked for obviously bad values. Record values can
+ * also be cross-referenced against other btrees to look for potential
+ * misunderstandings between pieces of metadata.
+ *
+ * It is expected that the checkers responsible for per-AG metadata
+ * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
+ * metadata structure, and perform any relevant cross-referencing before
+ * unlocking the AG and returning the results to userspace. These
+ * scrubbers must not keep an AG locked for too long to avoid tying up
+ * the block and inode allocators.
+ *
+ * Block maps and b-trees rooted in an inode present a special challenge
+ * because they can involve extents from any AG. The general scrubber
+ * structure of lock -> check -> xref -> unlock still holds, but AG
+ * locking order rules /must/ be obeyed to avoid deadlocks. The
+ * ordering rule, of course, is that we must lock in increasing AG
+ * order. Helper functions are provided to track which AG headers we've
+ * already locked. If we detect an imminent locking order violation, we
+ * can signal a potential deadlock, in which case the scrubber can jump
+ * out to the top level, lock all the AGs in order, and retry the scrub.
+ *
+ * For file data (directories, extended attributes, symlinks) scrub, we
+ * can simply lock the inode and walk the data. For btree data
+ * (directories and attributes) we follow the same btree-scrubbing
+ * strategy outlined previously to check the records.
+ *
+ * We use a bit of trickery with transactions to avoid buffer deadlocks
+ * if there is a cycle in the metadata. The basic problem is that
+ * travelling down a btree involves locking the current buffer at each
+ * tree level. If a pointer should somehow point back to a buffer that
+ * we've already examined, we will deadlock due to the second buffer
+ * locking attempt. Note however that grabbing a buffer in transaction
+ * context links the locked buffer to the transaction. If we try to
+ * re-grab the buffer in the context of the same transaction, we avoid
+ * the second lock attempt and continue. Between the verifier and the
+ * scrubber, something will notice that something is amiss and report
+ * the corruption. Therefore, each scrubber will allocate an empty
+ * transaction, attach buffers to it, and cancel the transaction at the
+ * end of the scrub run. Cancelling a non-dirty transaction simply
+ * unlocks the buffers.
+ *
+ * There are four pieces of data that scrub can communicate to
+ * userspace. The first is the error code (errno), which can be used to
+ * communicate operational errors in performing the scrub. There are
+ * also three flags that can be set in the scrub context. If the data
+ * structure itself is corrupt, the CORRUPT flag will be set. If
+ * the metadata is correct but otherwise suboptimal, the PREEN flag
+ * will be set.
+ */
+
+/*
+ * Scrub probe -- the do-nothing scrubber. Userspace (xfs_scrub) calls
+ * it to find out whether this kernel is willing to scrub (or repair)
+ * the metadata of a given mountpoint. The ioctl inputs are validated
+ * and the filesystem is prepared for a scrub (or a repair) operation
+ * exactly as for a real scrubber, but we then return to userspace
+ * immediately. Userspace can use the returned errno and structure
+ * state to decide (in broad terms) if scrub/repair are supported by
+ * the running kernel.
+ */
+static int
+xfs_scrub_probe(
+ struct xfs_scrub_context *sc)
+{
+ int error = 0;
+
+ /* A probe names no inode and no AG. */
+ if (sc->sm->sm_ino != 0 || sc->sm->sm_agno != 0)
+ return -EINVAL;
+
+ /* Only report the termination error if we were told to stop. */
+ return xfs_scrub_should_terminate(sc, &error) ? error : 0;
+}
+
+/* Scrub setup and teardown */
+
+/* Release AG state, transaction, inode and buffer; @error is passed through. */
+STATIC int
+xfs_scrub_teardown(
+ struct xfs_scrub_context *sc,
+ struct xfs_inode *ip_in,
+ int error)
+{
+ xfs_scrub_ag_free(sc, &sc->sa);
+ if (sc->tp) {
+ xfs_trans_cancel(sc->tp);
+ sc->tp = NULL;
+ }
+ if (sc->ip) {
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ if (sc->ip != ip_in &&
+ !xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ iput(VFS_I(sc->ip));
+ sc->ip = NULL;
+ }
+ if (sc->buf) {
+ kmem_free(sc->buf);
+ sc->buf = NULL;
+ }
+ return error;
+}
+
+/* Scrubbing dispatch: the table is indexed by sm_type, so order matters. */
+
+static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
+ { /* ioctl presence test */
+ .setup = xfs_scrub_setup_fs,
+ .scrub = xfs_scrub_probe,
+ },
+ { /* superblock */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_superblock,
+ },
+ { /* agf */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_agf,
+ },
+ { /* agfl */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_agfl,
+ },
+ { /* agi */
+ .setup = xfs_scrub_setup_ag_header,
+ .scrub = xfs_scrub_agi,
+ },
+ { /* bnobt */
+ .setup = xfs_scrub_setup_ag_allocbt,
+ .scrub = xfs_scrub_bnobt,
+ },
+ { /* cntbt */
+ .setup = xfs_scrub_setup_ag_allocbt,
+ .scrub = xfs_scrub_cntbt,
+ },
+ { /* inobt */
+ .setup = xfs_scrub_setup_ag_iallocbt,
+ .scrub = xfs_scrub_inobt,
+ },
+ { /* finobt */
+ .setup = xfs_scrub_setup_ag_iallocbt,
+ .scrub = xfs_scrub_finobt,
+ .has = xfs_sb_version_hasfinobt,
+ },
+ { /* rmapbt */
+ .setup = xfs_scrub_setup_ag_rmapbt,
+ .scrub = xfs_scrub_rmapbt,
+ .has = xfs_sb_version_hasrmapbt,
+ },
+ { /* refcountbt */
+ .setup = xfs_scrub_setup_ag_refcountbt,
+ .scrub = xfs_scrub_refcountbt,
+ .has = xfs_sb_version_hasreflink,
+ },
+ { /* inode record */
+ .setup = xfs_scrub_setup_inode,
+ .scrub = xfs_scrub_inode,
+ },
+ { /* inode data fork */
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_data,
+ },
+ { /* inode attr fork */
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_attr,
+ },
+ { /* inode CoW fork */
+ .setup = xfs_scrub_setup_inode_bmap,
+ .scrub = xfs_scrub_bmap_cow,
+ },
+ { /* directory */
+ .setup = xfs_scrub_setup_directory,
+ .scrub = xfs_scrub_directory,
+ },
+ { /* extended attributes */
+ .setup = xfs_scrub_setup_xattr,
+ .scrub = xfs_scrub_xattr,
+ },
+ { /* symbolic link */
+ .setup = xfs_scrub_setup_symlink,
+ .scrub = xfs_scrub_symlink,
+ },
+ { /* parent pointers */
+ .setup = xfs_scrub_setup_parent,
+ .scrub = xfs_scrub_parent,
+ },
+ { /* realtime bitmap */
+ .setup = xfs_scrub_setup_rt,
+ .scrub = xfs_scrub_rtbitmap,
+ .has = xfs_sb_version_hasrealtime,
+ },
+ { /* realtime summary */
+ .setup = xfs_scrub_setup_rt,
+ .scrub = xfs_scrub_rtsummary,
+ .has = xfs_sb_version_hasrealtime,
+ },
+ { /* user quota */
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+ { /* group quota */
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+ { /* project quota */
+ .setup = xfs_scrub_setup_quota,
+ .scrub = xfs_scrub_quota,
+ },
+};
+
+/* This isn't a stable feature; alert at most once per 86400 * HZ (a day). */
+static inline void
+xfs_scrub_experimental_warning(
+ struct xfs_mount *mp)
+{
+ static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
+ "xfs_scrub_warning", 86400 * HZ, 1);
+ ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
+
+ if (__ratelimit(&scrub_warning))
+ xfs_alert(mp,
+"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
+}
+
+/* Dispatch metadata scrubbing: validate @sm, then run the ops for sm_type. */
+int
+xfs_scrub_metadata(
+ struct xfs_inode *ip,
+ struct xfs_scrub_metadata *sm)
+{
+ struct xfs_scrub_context sc;
+ struct xfs_mount *mp = ip->i_mount;
+ const struct xfs_scrub_meta_ops *ops;
+ bool try_harder = false;
+ int error = 0;
+
+ trace_xfs_scrub_start(ip, sm, error);
+
+ /* Forbidden if we are shut down or mounted norecovery. */
+ error = -ESHUTDOWN;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out;
+ error = -ENOTRECOVERABLE;
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ goto out;
+
+ /* Check our inputs; output flags are cleared before we start. */
+ error = -EINVAL;
+ sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
+ goto out;
+ if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
+ goto out;
+
+ /* Do we know about this type of metadata? */
+ error = -ENOENT;
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
+ goto out;
+ ops = &meta_scrub_ops[sm->sm_type];
+ if (ops->scrub == NULL)
+ goto out;
+
+ /*
+ * We won't scrub any filesystem that doesn't have the ability
+ * to record unwritten extents. The option was made default in
+ * 2003, removed from mkfs in 2007, and cannot be disabled in
+ * v5, so if we find a filesystem without this flag it's either
+ * really old or totally unsupported. Avoid it either way.
+ * We also don't support v1-v3 filesystems, which aren't
+ * mountable.
+ */
+ error = -EOPNOTSUPP;
+ if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
+ goto out;
+
+ /* Does this fs even support this type of metadata? */
+ error = -ENOENT;
+ if (ops->has && !ops->has(&mp->m_sb))
+ goto out;
+
+ /* We don't know how to repair anything yet. */
+ error = -EOPNOTSUPP;
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ goto out;
+
+ xfs_scrub_experimental_warning(mp);
+
+retry_op:
+ /* Set up for the operation, starting from a zeroed scrub context. */
+ memset(&sc, 0, sizeof(sc));
+ sc.mp = ip->i_mount;
+ sc.sm = sm;
+ sc.ops = ops;
+ sc.try_harder = try_harder;
+ sc.sa.agno = NULLAGNUMBER;
+ error = sc.ops->setup(&sc, ip);
+ if (error)
+ goto out_teardown;
+
+ /* Scrub for errors; -EDEADLOCK earns one retry with try_harder set. */
+ error = sc.ops->scrub(&sc);
+ if (!try_harder && error == -EDEADLOCK) {
+ /*
+ * Scrubbers return -EDEADLOCK to mean 'try harder'.
+ * Tear down everything we hold, then set up again with
+ * preparation for worst-case scenarios.
+ */
+ error = xfs_scrub_teardown(&sc, ip, 0);
+ if (error)
+ goto out;
+ try_harder = true;
+ goto retry_op;
+ } else if (error)
+ goto out_teardown;
+
+ if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT))
+ xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
+
+out_teardown:
+ error = xfs_scrub_teardown(&sc, ip, error);
+out:
+ trace_xfs_scrub_done(ip, sm, error);
+ if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
+ sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ error = 0;
+ }
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
new file mode 100644
index 000000000000..e9ec041cf713
--- /dev/null
+++ b/fs/xfs/scrub/scrub.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_SCRUB_H__
+#define __XFS_SCRUB_SCRUB_H__
+
+struct xfs_scrub_context;
+
+/*
+ * Operations for scrubbing one type of metadata.  The dispatcher in
+ * scrub.c calls ->setup and then ->scrub on a freshly zeroed context.
+ */
+struct xfs_scrub_meta_ops {
+ /*
+ * Acquire whatever resources are needed for the operation.  A nonzero
+ * return makes the dispatcher skip ->scrub and tear the context down.
+ */
+ int (*setup)(struct xfs_scrub_context *,
+ struct xfs_inode *);
+
+ /*
+ * Examine metadata for errors.  Returning -EDEADLOCK asks the
+ * dispatcher to tear down and retry once with sc->try_harder set.
+ */
+ int (*scrub)(struct xfs_scrub_context *);
+
+ /*
+ * Decide if we even have this piece of metadata.  May be NULL; when
+ * set and it returns false the dispatcher fails the call with -ENOENT.
+ */
+ bool (*has)(struct xfs_sb *);
+};
+
+/*
+ * Buffer pointers and btree cursors for an entire AG.  The dispatcher in
+ * scrub.c sets agno to NULLAGNUMBER before calling ->setup.
+ */
+struct xfs_scrub_ag {
+ xfs_agnumber_t agno;
+
+ /* AG btree roots (buffers for the AGF, AGFL and AGI, going by the names) */
+ struct xfs_buf *agf_bp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_buf *agi_bp;
+
+ /* AG btrees */
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur;
+ struct xfs_btree_cur *rmap_cur;
+ struct xfs_btree_cur *refc_cur;
+};
+
+/*
+ * State for a single scrub call.  The dispatcher in scrub.c zeroes this
+ * structure and fills in mp, sm, ops, try_harder and sa.agno before calling
+ * ->setup; the remaining fields start out as zero/NULL.
+ */
+struct xfs_scrub_context {
+ /* General scrub state. */
+ struct xfs_mount *mp;
+ /* The caller's request; result OFLAGs are reported through sm_flags. */
+ struct xfs_scrub_metadata *sm;
+ const struct xfs_scrub_meta_ops *ops;
+ struct xfs_trans *tp;
+ /* Inode under examination, as used by the inode-based scrubbers. */
+ struct xfs_inode *ip;
+ /* Scratch memory; the symlink scrubber keeps the link target here. */
+ void *buf;
+ /* NOTE(review): presumably the lock flags held on ip; confirm in common.c */
+ uint ilock_flags;
+ /* True on the retry that follows a scrubber returning -EDEADLOCK. */
+ bool try_harder;
+
+ /* State tracking for single-AG operations. */
+ struct xfs_scrub_ag sa;
+};
+
+/* Metadata scrubbers */
+int xfs_scrub_tester(struct xfs_scrub_context *sc);
+int xfs_scrub_superblock(struct xfs_scrub_context *sc);
+int xfs_scrub_agf(struct xfs_scrub_context *sc);
+int xfs_scrub_agfl(struct xfs_scrub_context *sc);
+int xfs_scrub_agi(struct xfs_scrub_context *sc);
+int xfs_scrub_bnobt(struct xfs_scrub_context *sc);
+int xfs_scrub_cntbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inobt(struct xfs_scrub_context *sc);
+int xfs_scrub_finobt(struct xfs_scrub_context *sc);
+int xfs_scrub_rmapbt(struct xfs_scrub_context *sc);
+int xfs_scrub_refcountbt(struct xfs_scrub_context *sc);
+int xfs_scrub_inode(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_data(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_attr(struct xfs_scrub_context *sc);
+int xfs_scrub_bmap_cow(struct xfs_scrub_context *sc);
+int xfs_scrub_directory(struct xfs_scrub_context *sc);
+int xfs_scrub_xattr(struct xfs_scrub_context *sc);
+int xfs_scrub_symlink(struct xfs_scrub_context *sc);
+int xfs_scrub_parent(struct xfs_scrub_context *sc);
+#ifdef CONFIG_XFS_RT
+int xfs_scrub_rtbitmap(struct xfs_scrub_context *sc);
+int xfs_scrub_rtsummary(struct xfs_scrub_context *sc);
+#else
+/* Without CONFIG_XFS_RT the realtime scrubbers are stubs returning -ENOENT. */
+static inline int
+xfs_scrub_rtbitmap(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xfs_scrub_rtsummary(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+#endif
+#ifdef CONFIG_XFS_QUOTA
+int xfs_scrub_quota(struct xfs_scrub_context *sc);
+#else
+/* Without CONFIG_XFS_QUOTA the quota scrubber is a stub returning -ENOENT. */
+static inline int
+xfs_scrub_quota(struct xfs_scrub_context *sc)
+{
+ return -ENOENT;
+}
+#endif
+
+#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
new file mode 100644
index 000000000000..3aa3d60f7c16
--- /dev/null
+++ b/fs/xfs/scrub/symlink.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_symlink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Prepare to scrub a symbolic link.
+ *
+ * A zeroed buffer of XFS_SYMLINK_MAXLEN + 1 bytes is attached to sc->buf
+ * first, so that the allocation happens before
+ * xfs_scrub_setup_inode_contents() runs, i.e. without the inode lock held.
+ *
+ * Returns -ENOMEM if the buffer could not be allocated, otherwise whatever
+ * xfs_scrub_setup_inode_contents() returns.
+ */
+int
+xfs_scrub_setup_symlink(
+	struct xfs_scrub_context	*sc,
+	struct xfs_inode		*ip)
+{
+	void				*target;
+
+	target = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+	sc->buf = target;
+	if (target == NULL)
+		return -ENOMEM;
+
+	return xfs_scrub_setup_inode_contents(sc, ip, 0);
+}
+
+/*
+ * Scrub a symbolic link.
+ *
+ * Checks that the recorded length is within 1..XFS_SYMLINK_MAXLEN and that
+ * the stored target is at least that long.  Inline targets are checked in
+ * place in the data fork; remote targets are read into sc->buf first.
+ * Problems are flagged against offset 0 of the data fork.
+ *
+ * Returns -ENOENT if the inode is not a symlink, 0 once an inline or
+ * implausibly sized link has been dealt with, and otherwise the read error
+ * as left behind by xfs_scrub_fblock_process_error().
+ */
+int
+xfs_scrub_symlink(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_inode		*lnk = sc->ip;
+	struct xfs_ifork		*dfork;
+	loff_t				target_len;
+	bool				bad;
+	int				error;
+
+	if (!S_ISLNK(VFS_I(lnk)->i_mode))
+		return -ENOENT;
+
+	dfork = XFS_IFORK_PTR(lnk, XFS_DATA_FORK);
+	target_len = lnk->i_d.di_size;
+
+	/* An empty or oversized target can never be valid. */
+	if (target_len <= 0 || target_len > XFS_SYMLINK_MAXLEN) {
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return 0;
+	}
+
+	/* Inline target: it lives in the fork itself, nothing to read. */
+	if (dfork->if_flags & XFS_IFINLINE) {
+		bad = target_len > XFS_IFORK_DSIZE(lnk) ||
+		      target_len > strnlen(dfork->if_u1.if_data,
+					   XFS_IFORK_DSIZE(lnk));
+		if (bad)
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+		return 0;
+	}
+
+	/* Remote target: pull the contents into the scratch buffer. */
+	error = xfs_readlink_bmap_ilocked(lnk, sc->buf);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+		return error;
+
+	if (target_len > strnlen(sc->buf, XFS_SYMLINK_MAXLEN))
+		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+
+	return error;
+}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
new file mode 100644
index 000000000000..472080e75788
--- /dev/null
+++ b/fs/xfs/scrub/trace.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+
+/*
+ * Work out which filesystem block a btree cursor refers to at @level.
+ *
+ * If the cursor holds a buffer for a valid level, that buffer's disk address
+ * is converted.  Otherwise a short-pointer cursor reports block 0 of its AG,
+ * and a long-pointer cursor reports the block of its owning inode, but only
+ * for the topmost level.  Anything else yields NULLFSBLOCK.
+ */
+static inline xfs_fsblock_t
+xfs_scrub_btree_cur_fsbno(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	bool			long_ptrs = cur->bc_flags & XFS_BTREE_LONG_PTRS;
+
+	if (level < cur->bc_nlevels && cur->bc_bufs[level])
+		return XFS_DADDR_TO_FSB(mp, cur->bc_bufs[level]->b_bn);
+
+	if (!long_ptrs)
+		return XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, 0);
+
+	if (level == cur->bc_nlevels - 1)
+		return XFS_INO_TO_FSB(mp, cur->bc_private.b.ip->i_ino);
+
+	return NULLFSBLOCK;
+}
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
new file mode 100644
index 000000000000..c4ebfb5c1ee8
--- /dev/null
+++ b/fs/xfs/scrub/trace.h
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs_scrub
+
+#if !defined(_TRACE_XFS_SCRUB_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_SCRUB_TRACE_H
+
+#include <linux/tracepoint.h>
+#include "xfs_bit.h"
+
+/*
+ * Event class for whole-request scrub events.  "ino" is the number of the
+ * inode passed in as ip; "type", "agno", "inum", "gen" and "flags" are
+ * copied from the request (sm); "error" is the value supplied by the caller.
+ */
+DECLARE_EVENT_CLASS(xfs_scrub_class,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
+ int error),
+ TP_ARGS(ip, sm, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, inum)
+ __field(unsigned int, gen)
+ __field(unsigned int, flags)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->type = sm->sm_type;
+ __entry->agno = sm->sm_agno;
+ __entry->inum = sm->sm_ino;
+ __entry->gen = sm->sm_gen;
+ __entry->flags = sm->sm_flags;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d ino %llu type %u agno %u inum %llu gen %u flags 0x%x error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->type,
+ __entry->agno,
+ __entry->inum,
+ __entry->gen,
+ __entry->flags,
+ __entry->error)
+)
+/* Instantiate xfs_scrub_class under the given event name. */
+#define DEFINE_SCRUB_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_class, name, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, \
+ int error), \
+ TP_ARGS(ip, sm, error))
+
+DEFINE_SCRUB_EVENT(xfs_scrub_start);
+DEFINE_SCRUB_EVENT(xfs_scrub_done);
+DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry);
+
+/*
+ * Operational error at an AG-relative location.  Records the scrub type,
+ * the agno/bno and error handed in by the caller, and ret_ip, a code
+ * address that is printed symbolically via %pF.
+ */
+TRACE_EVENT(xfs_scrub_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
+ xfs_agblock_t bno, int error, void *ret_ip),
+ TP_ARGS(sc, agno, bno, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+/*
+ * Operational error at a file offset.  The device and inode number are
+ * taken from sc->ip, so sc->ip must be set when this event fires; the
+ * fork, offset, error and ret_ip are recorded as passed in.
+ */
+TRACE_EVENT(xfs_scrub_file_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, int error, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->offset,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+/*
+ * Event class for problems reported against a disk address.  The daddr is
+ * converted to a filesystem block and then split into the AG number and
+ * AG block number that are stored in the record.
+ */
+DECLARE_EVENT_CLASS(xfs_scrub_block_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, void *ret_ip),
+ TP_ARGS(sc, daddr, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+
+ fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+)
+
+/* Instantiate xfs_scrub_block_error_class under the given event name. */
+#define DEFINE_SCRUB_BLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_block_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_daddr_t daddr, \
+ void *ret_ip), \
+ TP_ARGS(sc, daddr, ret_ip))
+
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error);
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen);
+
+/*
+ * Event class for problems reported against an inode.  When a nonzero
+ * daddr is supplied the AG number and AG block come from that address;
+ * when daddr is zero they are derived from the inode number instead.
+ */
+DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr,
+ void *ret_ip),
+ TP_ARGS(sc, ino, daddr, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, type)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+
+ if (daddr) {
+ fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr);
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ } else {
+ /* No disk address given: locate the inode by its number. */
+ agno = XFS_INO_TO_AGNO(sc->mp, ino);
+ bno = XFS_AGINO_TO_AGBNO(sc->mp,
+ XFS_INO_TO_AGINO(sc->mp, ino));
+ }
+
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = ino;
+ __entry->type = sc->sm->sm_type;
+ __entry->agno = agno;
+ __entry->bno = bno;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu type %u agno %u agbno %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->type,
+ __entry->agno,
+ __entry->bno,
+ __entry->ret_ip)
+)
+
+/* Instantiate xfs_scrub_ino_error_class under the given event name. */
+#define DEFINE_SCRUB_INO_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_ino_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \
+ xfs_daddr_t daddr, void *ret_ip), \
+ TP_ARGS(sc, ino, daddr, ret_ip))
+
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error);
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen);
+DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_warning);
+
+/*
+ * Event class for problems reported against a file offset in one fork of
+ * sc->ip.  The device and inode number are read through sc->ip, so it must
+ * be set when these events fire.
+ */
+DECLARE_EVENT_CLASS(xfs_scrub_fblock_error_class,
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork,
+ xfs_fileoff_t offset, void *ret_ip),
+ TP_ARGS(sc, whichfork, offset, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_fileoff_t, offset)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->ip->i_mount->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->offset = offset;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u offset %llu ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->offset,
+ __entry->ret_ip)
+);
+
+/* Instantiate xfs_scrub_fblock_error_class under the given event name. */
+#define DEFINE_SCRUB_FBLOCK_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_fblock_error_class, name, \
+ TP_PROTO(struct xfs_scrub_context *sc, int whichfork, \
+ xfs_fileoff_t offset, void *ret_ip), \
+ TP_ARGS(sc, whichfork, offset, ret_ip))
+
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_error);
+DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xfs_scrub_fblock_warning);
+
+/*
+ * Records only the device, the scrub type and the reporting code address;
+ * no location information is attached to this event.
+ */
+TRACE_EVENT(xfs_scrub_incomplete,
+ TP_PROTO(struct xfs_scrub_context *sc, void *ret_ip),
+ TP_ARGS(sc, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d type %u ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->ret_ip)
+);
+
+/*
+ * Operational error while working on a btree.  Records the scrub type, the
+ * btree type, the level and cur->bc_ptrs[level], the AG number and AG block
+ * derived from xfs_scrub_btree_cur_fsbno(), the error, and the reporting
+ * code address.
+ */
+TRACE_EVENT(xfs_scrub_btree_op_error,
+	TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+		 int level, int error, void *ret_ip),
+	TP_ARGS(sc, cur, level, error, ret_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(xfs_btnum_t, btnum)
+		__field(int, level)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, bno)
+		__field(int, ptr)
+		__field(int, error)
+		__field(void *, ret_ip)
+	),
+	TP_fast_assign(
+		xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->type = sc->sm->sm_type;
+		__entry->btnum = cur->bc_btnum;
+		__entry->level = level;
+		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+		__entry->ptr = cur->bc_ptrs[level];
+		__entry->error = error;
+		__entry->ret_ip = ret_ip;
+	),
+	TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->btnum,
+		  __entry->level,
+		  __entry->ptr,
+		  __entry->agno,
+		  __entry->bno,
+		  __entry->error,
+		  __entry->ret_ip)
+);
+
+/*
+ * Operational error while working on an inode fork btree.  In addition to
+ * the fields of the plain btree variant this records sc->ip's inode number
+ * and the fork taken from cur->bc_private.b.whichfork, so it is only
+ * meaningful for cursors using the "b" private area with sc->ip set.
+ */
+TRACE_EVENT(xfs_scrub_ifork_btree_op_error,
+ TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+ int level, int error, void *ret_ip),
+ TP_ARGS(sc, cur, level, error, ret_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(unsigned int, type)
+ __field(xfs_btnum_t, btnum)
+ __field(int, level)
+ __field(int, ptr)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bno)
+ __field(int, error)
+ __field(void *, ret_ip)
+ ),
+ TP_fast_assign(
+ xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->ip->i_ino;
+ __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->type = sc->sm->sm_type;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->ptr = cur->bc_ptrs[level];
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+ __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ __entry->error = error;
+ __entry->ret_ip = ret_ip;
+ ),
+ TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pF",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->whichfork,
+ __entry->type,
+ __entry->btnum,
+ __entry->level,
+ __entry->ptr,
+ __entry->agno,
+ __entry->bno,
+ __entry->error,
+ __entry->ret_ip)
+);
+
+/*
+ * Problem found in a btree block.  Records the scrub type, the btree type,
+ * the level and cur->bc_ptrs[level], the AG number and AG block derived
+ * from xfs_scrub_btree_cur_fsbno(), and the reporting code address.
+ */
+TRACE_EVENT(xfs_scrub_btree_error,
+	TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+		 int level, void *ret_ip),
+	TP_ARGS(sc, cur, level, ret_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(xfs_btnum_t, btnum)
+		__field(int, level)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, bno)
+		__field(int, ptr)
+		__field(void *, ret_ip)
+	),
+	TP_fast_assign(
+		xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->type = sc->sm->sm_type;
+		__entry->btnum = cur->bc_btnum;
+		__entry->level = level;
+		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+		__entry->ptr = cur->bc_ptrs[level];
+		__entry->ret_ip = ret_ip;
+	),
+	TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->btnum,
+		  __entry->level,
+		  __entry->ptr,
+		  __entry->agno,
+		  __entry->bno,
+		  __entry->ret_ip)
+);
+
+/*
+ * Problem found in an inode fork btree block.  In addition to the fields of
+ * the plain btree variant this records sc->ip's inode number and the fork
+ * taken from cur->bc_private.b.whichfork, so it is only meaningful for
+ * cursors using the "b" private area with sc->ip set.
+ */
+TRACE_EVENT(xfs_scrub_ifork_btree_error,
+	TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+		 int level, void *ret_ip),
+	TP_ARGS(sc, cur, level, ret_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, whichfork)
+		__field(unsigned int, type)
+		__field(xfs_btnum_t, btnum)
+		__field(int, level)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, bno)
+		__field(int, ptr)
+		__field(void *, ret_ip)
+	),
+	TP_fast_assign(
+		xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = sc->ip->i_ino;
+		__entry->whichfork = cur->bc_private.b.whichfork;
+		__entry->type = sc->sm->sm_type;
+		__entry->btnum = cur->bc_btnum;
+		__entry->level = level;
+		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+		__entry->ptr = cur->bc_ptrs[level];
+		__entry->ret_ip = ret_ip;
+	),
+	TP_printk("dev %d:%d ino %llu fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pF",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->whichfork,
+		  __entry->type,
+		  __entry->btnum,
+		  __entry->level,
+		  __entry->ptr,
+		  __entry->agno,
+		  __entry->bno,
+		  __entry->ret_ip)
+);
+
+/*
+ * Event class for per-record and per-key btree scrub events.  Records the
+ * scrub type, the btree type, the AG number and AG block derived from
+ * xfs_scrub_btree_cur_fsbno(), the level being examined, the height of the
+ * tree (cur->bc_nlevels) and cur->bc_ptrs[level].
+ *
+ * "type" is unsigned to agree with the %u used to print it and with the
+ * declaration of the same field in the other scrub events.
+ */
+DECLARE_EVENT_CLASS(xfs_scrub_sbtree_class,
+	TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur,
+		 int level),
+	TP_ARGS(sc, cur, level),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, type)
+		__field(xfs_btnum_t, btnum)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, bno)
+		__field(int, level)
+		__field(int, nlevels)
+		__field(int, ptr)
+	),
+	TP_fast_assign(
+		xfs_fsblock_t fsbno = xfs_scrub_btree_cur_fsbno(cur, level);
+
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->type = sc->sm->sm_type;
+		__entry->btnum = cur->bc_btnum;
+		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
+		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+		__entry->level = level;
+		__entry->nlevels = cur->bc_nlevels;
+		__entry->ptr = cur->bc_ptrs[level];
+	),
+	TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->btnum,
+		  __entry->agno,
+		  __entry->bno,
+		  __entry->level,
+		  __entry->nlevels,
+		  __entry->ptr)
+)
+/* Instantiate xfs_scrub_sbtree_class under the given event name. */
+#define DEFINE_SCRUB_SBTREE_EVENT(name) \
+DEFINE_EVENT(xfs_scrub_sbtree_class, name, \
+	TP_PROTO(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, \
+		 int level), \
+	TP_ARGS(sc, cur, level))
+
+DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_rec);
+DEFINE_SCRUB_SBTREE_EVENT(xfs_scrub_btree_key);
+
+#endif /* _TRACE_XFS_SCRUB_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE scrub/trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
new file mode 100644
index 000000000000..e00e0eadac6a
--- /dev/null
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_H__
+#define __XFS_SCRUB_H__
+
+#ifndef CONFIG_XFS_ONLINE_SCRUB
+/* Scrub is compiled out: every call evaluates to -ENOTTY, arguments unused. */
+# define xfs_scrub_metadata(ip, sm) (-ENOTTY)
+#else
+/* Real implementation, built when CONFIG_XFS_ONLINE_SCRUB is set. */
+int xfs_scrub_metadata(struct xfs_inode *ip, struct xfs_scrub_metadata *sm);
+#endif /* CONFIG_XFS_ONLINE_SCRUB */
+
+#endif /* __XFS_SCRUB_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 80cd0fd86783..5ff7f228d616 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -19,7 +19,6 @@
#define __XFS_H__
#ifdef CONFIG_XFS_DEBUG
-#define STATIC
#define DEBUG 1
#define XFS_BUF_LOCK_TRACKING 1
#endif
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7034e17535de..3354140de07e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -247,6 +247,8 @@ xfs_set_mode(struct inode *inode, umode_t mode)
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
+ umode_t mode;
+ bool set_mode = false;
int error = 0;
if (!acl)
@@ -257,16 +259,24 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return error;
if (type == ACL_TYPE_ACCESS) {
- umode_t mode;
-
error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
- error = xfs_set_mode(inode, mode);
- if (error)
- return error;
+ set_mode = true;
}
set_acl:
- return __xfs_set_acl(inode, acl, type);
+ error = __xfs_set_acl(inode, acl, type);
+ if (error)
+ return error;
+
+ /*
+ * We set the mode after successfully updating the ACL xattr because the
+ * xattr update can fail at ENOSPC and we don't want to change the mode
+ * if the ACL update hasn't been applied.
+ */
+ if (set_mode)
+ error = xfs_set_mode(inode, mode);
+
+ return error;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 29172609f2a3..a3eeaba156c5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -343,7 +343,8 @@ xfs_end_io(
error = xfs_reflink_end_cow(ip, offset, size);
break;
case XFS_IO_UNWRITTEN:
- error = xfs_iomap_write_unwritten(ip, offset, size);
+ /* writeback should never update isize */
+ error = xfs_iomap_write_unwritten(ip, offset, size, false);
break;
default:
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
@@ -445,6 +446,19 @@ xfs_imap_valid(
{
offset >>= inode->i_blkbits;
+ /*
+ * We have to make sure the cached mapping is within EOF to protect
+ * against eofblocks trimming on file release leaving us with a stale
+ * mapping. Otherwise, a page for a subsequent file extending buffered
+ * write could get picked up by this writeback cycle and written to the
+ * wrong blocks.
+ *
+ * Note that what we really want here is a generic mapping invalidation
+ * mechanism to protect us from arbitrary extent modifying contexts, not
+ * just eofblocks.
+ */
+ xfs_trim_extent_eof(imap, XFS_I(inode));
+
return offset >= imap->br_startoff &&
offset < imap->br_startoff + imap->br_blockcount;
}
@@ -734,6 +748,14 @@ xfs_vm_invalidatepage(
{
trace_xfs_invalidatepage(page->mapping->host, page, offset,
length);
+
+ /*
+ * If we are invalidating the entire page, clear the dirty state from it
+ * so that we can check for attempts to release dirty cached pages in
+ * xfs_vm_releasepage().
+ */
+ if (offset == 0 && length >= PAGE_SIZE)
+ cancel_dirty_page(page);
block_invalidatepage(page, offset, length);
}
@@ -1189,25 +1211,27 @@ xfs_vm_releasepage(
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
* ->releasepage() via shrink_active_list(). Conversely,
- * block_invalidatepage() can send pages that are still marked dirty
- * but otherwise have invalidated buffers.
+ * block_invalidatepage() can send pages that are still marked dirty but
+ * otherwise have invalidated buffers.
*
* We want to release the latter to avoid unnecessary buildup of the
- * LRU, skip the former and warn if we've left any lingering
- * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
- * or unwritten buffers and warn if the page is not dirty. Otherwise
- * try to release the buffers.
+ * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
+ * that are entirely invalidated and need to be released. Hence the
+ * only time we should get dirty pages here is through
+ * shrink_active_list() and so we can simply skip those now.
+ *
+ * warn if we've left any lingering delalloc/unwritten buffers on clean
+ * or invalidated pages we are about to release.
*/
+ if (PageDirty(page))
+ return 0;
+
xfs_count_page_state(page, &delalloc, &unwritten);
- if (delalloc) {
- WARN_ON_ONCE(!PageDirty(page));
+ if (WARN_ON_ONCE(delalloc))
return 0;
- }
- if (unwritten) {
- WARN_ON_ONCE(!PageDirty(page));
+ if (WARN_ON_ONCE(unwritten))
return 0;
- }
return try_to_free_buffers(page);
}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 5d5a5e277f35..d07bf27451c9 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -48,6 +48,8 @@ struct xfs_attr_list_context;
#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
+#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
+
#define XFS_ATTR_FLAGS \
{ ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
{ ATTR_ROOT, "ROOT" }, \
@@ -56,7 +58,8 @@ struct xfs_attr_list_context;
{ ATTR_CREATE, "CREATE" }, \
{ ATTR_REPLACE, "REPLACE" }, \
{ ATTR_KERNOTIME, "KERNOTIME" }, \
- { ATTR_KERNOVAL, "KERNOVAL" }
+ { ATTR_KERNOVAL, "KERNOVAL" }, \
+ { ATTR_INCOMPLETE, "INCOMPLETE" }
/*
* The maximum size (into the kernel or returned from the kernel) of an
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index ebd66b19fbfc..52818ea2eb50 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -251,47 +251,44 @@ xfs_attr3_node_inactive(
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
- XFS_ATTR_FORK);
+ error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp,
+ XFS_ATTR_FORK);
if (error)
return error;
- if (child_bp) {
- /* save for re-read later */
- child_blkno = XFS_BUF_ADDR(child_bp);
- /*
- * Invalidate the subtree, however we have to.
- */
- info = child_bp->b_addr;
- switch (info->magic) {
- case cpu_to_be16(XFS_DA_NODE_MAGIC):
- case cpu_to_be16(XFS_DA3_NODE_MAGIC):
- error = xfs_attr3_node_inactive(trans, dp,
- child_bp, level + 1);
- break;
- case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
- case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
- error = xfs_attr3_leaf_inactive(trans, dp,
- child_bp);
- break;
- default:
- error = -EIO;
- xfs_trans_brelse(*trans, child_bp);
- break;
- }
- if (error)
- return error;
+ /* save for re-read later */
+ child_blkno = XFS_BUF_ADDR(child_bp);
- /*
- * Remove the subsidiary block from the cache
- * and from the log.
- */
- error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
- &child_bp, XFS_ATTR_FORK);
- if (error)
- return error;
- xfs_trans_binval(*trans, child_bp);
+ /*
+ * Invalidate the subtree, however we have to.
+ */
+ info = child_bp->b_addr;
+ switch (info->magic) {
+ case cpu_to_be16(XFS_DA_NODE_MAGIC):
+ case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+ error = xfs_attr3_node_inactive(trans, dp, child_bp,
+ level + 1);
+ break;
+ case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+ case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+ error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
+ break;
+ default:
+ error = -EIO;
+ xfs_trans_brelse(*trans, child_bp);
+ break;
}
+ if (error)
+ return error;
+
+ /*
+ * Remove the subsidiary block from the cache and from the log.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ xfs_trans_binval(*trans, child_bp);
/*
* If we're not done, re-read the parent to get the next
@@ -302,6 +299,8 @@ xfs_attr3_node_inactive(
&bp, XFS_ATTR_FORK);
if (error)
return error;
+ node = bp->b_addr;
+ btree = dp->d_ops->node_tree_p(node);
child_fsb = be32_to_cpu(btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 7740c8a5e736..3e59a348ea71 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -204,19 +204,103 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
return 0;
}
+/*
+ * We didn't find the block & hash mentioned in the cursor state, so
+ * walk down the attr btree looking for the hash.
+ */
STATIC int
-xfs_attr_node_list(xfs_attr_list_context_t *context)
+xfs_attr_node_list_lookup(
+ struct xfs_attr_list_context *context,
+ struct attrlist_cursor_kern *cursor,
+ struct xfs_buf **pbp)
{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_leafblock_t *leaf;
- xfs_da_intnode_t *node;
- struct xfs_attr3_icleaf_hdr leafhdr;
- struct xfs_da3_icnode_hdr nodehdr;
- struct xfs_da_node_entry *btree;
- int error, i;
- struct xfs_buf *bp;
- struct xfs_inode *dp = context->dp;
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da_intnode *node;
+ struct xfs_da_node_entry *btree;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_trans *tp = context->tp;
+ struct xfs_buf *bp;
+ int i;
+ int error = 0;
+ unsigned int expected_level = 0;
+ uint16_t magic;
+
+ ASSERT(*pbp == NULL);
+ cursor->blkno = 0;
+ for (;;) {
+ error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+ node = bp->b_addr;
+ magic = be16_to_cpu(node->hdr.info.magic);
+ if (magic == XFS_ATTR_LEAF_MAGIC ||
+ magic == XFS_ATTR3_LEAF_MAGIC)
+ break;
+ if (magic != XFS_DA_NODE_MAGIC &&
+ magic != XFS_DA3_NODE_MAGIC) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ node);
+ goto out_corruptbuf;
+ }
+
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+
+ /* Tree taller than we can handle; bail out! */
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ goto out_corruptbuf;
+
+ /* Check the level from the root node. */
+ if (cursor->blkno == 0)
+ expected_level = nodehdr.level - 1;
+ else if (expected_level != nodehdr.level)
+ goto out_corruptbuf;
+ else
+ expected_level--;
+
+ btree = dp->d_ops->node_tree_p(node);
+ for (i = 0; i < nodehdr.count; btree++, i++) {
+ if (cursor->hashval <= be32_to_cpu(btree->hashval)) {
+ cursor->blkno = be32_to_cpu(btree->before);
+ trace_xfs_attr_list_node_descend(context,
+ btree);
+ break;
+ }
+ }
+ xfs_trans_brelse(tp, bp);
+
+ if (i == nodehdr.count)
+ return 0;
+
+ /* We can't point back to the root. */
+ if (cursor->blkno == 0)
+ return -EFSCORRUPTED;
+ }
+
+ if (expected_level != 0)
+ goto out_corruptbuf;
+
+ *pbp = bp;
+ return 0;
+
+out_corruptbuf:
+ xfs_trans_brelse(tp, bp);
+ return -EFSCORRUPTED;
+}
+
+STATIC int
+xfs_attr_node_list(
+ struct xfs_attr_list_context *context)
+{
+ struct xfs_attr3_icleaf_hdr leafhdr;
+ struct attrlist_cursor_kern *cursor;
+ struct xfs_attr_leafblock *leaf;
+ struct xfs_da_intnode *node;
+ struct xfs_buf *bp;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
trace_xfs_attr_node_list(context);
@@ -277,47 +361,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
* Note that start of node block is same as start of leaf block.
*/
if (bp == NULL) {
- cursor->blkno = 0;
- for (;;) {
- uint16_t magic;
-
- error = xfs_da3_node_read(context->tp, dp,
- cursor->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error)
- return error;
- node = bp->b_addr;
- magic = be16_to_cpu(node->hdr.info.magic);
- if (magic == XFS_ATTR_LEAF_MAGIC ||
- magic == XFS_ATTR3_LEAF_MAGIC)
- break;
- if (magic != XFS_DA_NODE_MAGIC &&
- magic != XFS_DA3_NODE_MAGIC) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount,
- node);
- xfs_trans_brelse(context->tp, bp);
- return -EFSCORRUPTED;
- }
-
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
- for (i = 0; i < nodehdr.count; btree++, i++) {
- if (cursor->hashval
- <= be32_to_cpu(btree->hashval)) {
- cursor->blkno = be32_to_cpu(btree->before);
- trace_xfs_attr_list_node_descend(context,
- btree);
- break;
- }
- }
- if (i == nodehdr.count) {
- xfs_trans_brelse(context->tp, bp);
- return 0;
- }
- xfs_trans_brelse(context->tp, bp);
- }
+ error = xfs_attr_node_list_lookup(context, cursor, &bp);
+ if (error || !bp)
+ return error;
}
ASSERT(bp != NULL);
@@ -407,7 +453,8 @@ xfs_attr3_leaf_list_int(
cursor->offset = 0;
}
- if (entry->flags & XFS_ATTR_INCOMPLETE)
+ if ((entry->flags & XFS_ATTR_INCOMPLETE) &&
+ !(context->flags & ATTR_INCOMPLETE))
continue; /* skip incomplete entries */
if (entry->flags & XFS_ATTR_LOCAL) {
@@ -499,8 +546,8 @@ xfs_attr_list_int(
#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
(((struct attrlist_ent *) 0)->a_name - (char *) 0)
#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
- & ~(sizeof(u_int32_t)-1))
+ ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \
+ & ~(sizeof(uint32_t)-1))
/*
* Format an attribute and copy it out to the user's buffer.
@@ -583,6 +630,10 @@ xfs_attr_list(
(cursor->hashval || cursor->blkno || cursor->offset))
return -EINVAL;
+ /* Only internal consumers can retrieve incomplete attrs. */
+ if (flags & ATTR_INCOMPLETE)
+ return -EINVAL;
+
/*
* Check for a properly aligned buffer.
*/
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index cd9a5400ba4f..6d37ab43195f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -84,6 +84,7 @@ xfs_zero_extent(
GFP_NOFS, 0);
}
+#ifdef CONFIG_XFS_RT
int
xfs_bmap_rtalloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
@@ -190,6 +191,7 @@ xfs_bmap_rtalloc(
}
return 0;
}
+#endif /* CONFIG_XFS_RT */
/*
* Check if the endoff is outside the last extent. If so the caller will grow
@@ -227,15 +229,17 @@ xfs_bmap_count_leaves(
struct xfs_ifork *ifp,
xfs_filblks_t *count)
{
+ struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got;
- xfs_extnum_t numrecs = 0, i = 0;
+ xfs_extnum_t numrecs = 0;
- while (xfs_iext_get_extent(ifp, i++, &got)) {
+ for_each_xfs_iext(ifp, &icur, &got) {
if (!isnullstartblock(got.br_startblock)) {
*count += got.br_blockcount;
numrecs++;
}
}
+
return numrecs;
}
@@ -403,125 +407,103 @@ xfs_bmap_count_blocks(
return 0;
}
-/*
- * returns 1 for success, 0 if we failed to map the extent.
- */
-STATIC int
-xfs_getbmapx_fix_eof_hole(
- xfs_inode_t *ip, /* xfs incore inode pointer */
- int whichfork,
- struct getbmapx *out, /* output structure */
- int prealloced, /* this is a file with
- * preallocated data space */
- int64_t end, /* last block requested */
- xfs_fsblock_t startblock,
- bool moretocome)
+static int
+xfs_getbmap_report_one(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv,
+ struct kgetbmap *out,
+ int64_t bmv_end,
+ struct xfs_bmbt_irec *got)
{
- int64_t fixlen;
- xfs_mount_t *mp; /* file system mount point */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent pointer */
- xfs_fileoff_t fileblock;
-
- if (startblock == HOLESTARTBLOCK) {
- mp = ip->i_mount;
- out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
- fixlen -= out->bmv_offset;
- if (prealloced && out->bmv_offset + out->bmv_length == end) {
- /* Came to hole at EOF. Trim it. */
- if (fixlen <= 0)
- return 0;
- out->bmv_length = fixlen;
- }
+ struct kgetbmap *p = out + bmv->bmv_entries;
+ bool shared = false, trimmed = false;
+ int error;
+
+ error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+ if (error)
+ return error;
+
+ if (isnullstartblock(got->br_startblock) ||
+ got->br_startblock == DELAYSTARTBLOCK) {
+ /*
+ * Delalloc extents that start beyond EOF can occur due to
+ * speculative EOF allocation when the delalloc extent is larger
+ * than the largest freespace extent at conversion time. These
+ * extents cannot be converted by data writeback, so can exist
+ * here even if we are not supposed to be finding delalloc
+ * extents.
+ */
+ if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
+ ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
+
+ p->bmv_oflags |= BMV_OF_DELALLOC;
+ p->bmv_block = -2;
} else {
- if (startblock == DELAYSTARTBLOCK)
- out->bmv_block = -2;
- else
- out->bmv_block = xfs_fsb_to_db(ip, startblock);
- fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!moretocome &&
- xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
- (lastx == xfs_iext_count(ifp) - 1))
- out->bmv_oflags |= BMV_OF_LAST;
+ p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
}
- return 1;
+ if (got->br_state == XFS_EXT_UNWRITTEN &&
+ (bmv->bmv_iflags & BMV_IF_PREALLOC))
+ p->bmv_oflags |= BMV_OF_PREALLOC;
+
+ if (shared)
+ p->bmv_oflags |= BMV_OF_SHARED;
+
+ p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
+ p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
+
+ bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+ bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+ bmv->bmv_entries++;
+ return 0;
}
-/* Adjust the reported bmap around shared/unshared extent transitions. */
-STATIC int
-xfs_getbmap_adjust_shared(
- struct xfs_inode *ip,
- int whichfork,
- struct xfs_bmbt_irec *map,
- struct getbmapx *out,
- struct xfs_bmbt_irec *next_map)
+static void
+xfs_getbmap_report_hole(
+ struct xfs_inode *ip,
+ struct getbmapx *bmv,
+ struct kgetbmap *out,
+ int64_t bmv_end,
+ xfs_fileoff_t bno,
+ xfs_fileoff_t end)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_agblock_t ebno;
- xfs_extlen_t elen;
- xfs_extlen_t nlen;
- int error;
+ struct kgetbmap *p = out + bmv->bmv_entries;
- next_map->br_startblock = NULLFSBLOCK;
- next_map->br_startoff = NULLFILEOFF;
- next_map->br_blockcount = 0;
+ if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
+ return;
- /* Only written data blocks can be shared. */
- if (!xfs_is_reflink_inode(ip) ||
- whichfork != XFS_DATA_FORK ||
- !xfs_bmap_is_real_extent(map))
- return 0;
+ p->bmv_block = -1;
+ p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
+ p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
- agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
- error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
- map->br_blockcount, &ebno, &elen, true);
- if (error)
- return error;
+ bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+ bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+ bmv->bmv_entries++;
+}
- if (ebno == NULLAGBLOCK) {
- /* No shared blocks at all. */
- return 0;
- } else if (agbno == ebno) {
- /*
- * Shared extent at (agbno, elen). Shrink the reported
- * extent length and prepare to move the start of map[i]
- * to agbno+elen, with the aim of (re)formatting the new
- * map[i] the next time through the inner loop.
- */
- out->bmv_length = XFS_FSB_TO_BB(mp, elen);
- out->bmv_oflags |= BMV_OF_SHARED;
- if (elen != map->br_blockcount) {
- *next_map = *map;
- next_map->br_startblock += elen;
- next_map->br_startoff += elen;
- next_map->br_blockcount -= elen;
- }
- map->br_blockcount -= elen;
- } else {
- /*
- * There's an unshared extent (agbno, ebno - agbno)
- * followed by shared extent at (ebno, elen). Shrink
- * the reported extent length to cover only the unshared
- * extent and prepare to move up the start of map[i] to
- * ebno, with the aim of (re)formatting the new map[i]
- * the next time through the inner loop.
- */
- *next_map = *map;
- nlen = ebno - agbno;
- out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
- next_map->br_startblock += nlen;
- next_map->br_startoff += nlen;
- next_map->br_blockcount -= nlen;
- map->br_blockcount -= nlen;
- }
+static inline bool
+xfs_getbmap_full(
+ struct getbmapx *bmv)
+{
+ return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
+}
- return 0;
+static bool
+xfs_getbmap_next_rec(
+ struct xfs_bmbt_irec *rec,
+ xfs_fileoff_t total_end)
+{
+ xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount;
+
+ if (end == total_end)
+ return false;
+
+ rec->br_startoff += rec->br_blockcount;
+ if (!isnullstartblock(rec->br_startblock) &&
+ rec->br_startblock != DELAYSTARTBLOCK)
+ rec->br_startblock += rec->br_blockcount;
+ rec->br_blockcount = total_end - end;
+ return true;
}
/*
@@ -533,33 +515,22 @@ xfs_getbmap_adjust_shared(
*/
int /* error code */
xfs_getbmap(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
struct getbmapx *bmv, /* user bmap structure */
- xfs_bmap_format_t formatter, /* format to user */
- void *arg) /* formatter arg */
+ struct kgetbmap *out)
{
- int64_t bmvend; /* last block requested */
- int error = 0; /* return value */
- int64_t fixlen; /* length for -1 case */
- int i; /* extent number */
- int lock; /* lock state */
- xfs_bmbt_irec_t *map; /* buffer for user's data */
- xfs_mount_t *mp; /* file system mount point */
- int nex; /* # of user extents can do */
- int subnex; /* # of bmapi's can do */
- int nmap; /* number of map entries */
- struct getbmapx *out; /* output structure */
- int whichfork; /* data or attr fork */
- int prealloced; /* this is a file with
- * preallocated data space */
- int iflags; /* interface flags */
- int bmapi_flags; /* flags for xfs_bmapi */
- int cur_ext = 0;
- struct xfs_bmbt_irec inject_map;
-
- mp = ip->i_mount;
- iflags = bmv->bmv_iflags;
-
+ struct xfs_mount *mp = ip->i_mount;
+ int iflags = bmv->bmv_iflags;
+ int whichfork, lock, error = 0;
+ int64_t bmv_end, max_len;
+ xfs_fileoff_t bno, first_bno;
+ struct xfs_ifork *ifp;
+ struct xfs_bmbt_irec got, rec;
+ xfs_filblks_t len;
+ struct xfs_iext_cursor icur;
+
+ if (bmv->bmv_iflags & ~BMV_IF_VALID)
+ return -EINVAL;
#ifndef DEBUG
/* Only allow CoW fork queries if we're debugging. */
if (iflags & BMV_IF_COWFORK)
@@ -568,89 +539,42 @@ xfs_getbmap(
if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
return -EINVAL;
+ if (bmv->bmv_length < -1)
+ return -EINVAL;
+ bmv->bmv_entries = 0;
+ if (bmv->bmv_length == 0)
+ return 0;
+
if (iflags & BMV_IF_ATTRFORK)
whichfork = XFS_ATTR_FORK;
else if (iflags & BMV_IF_COWFORK)
whichfork = XFS_COW_FORK;
else
whichfork = XFS_DATA_FORK;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
switch (whichfork) {
case XFS_ATTR_FORK:
- if (XFS_IFORK_Q(ip)) {
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return -EINVAL;
- } else if (unlikely(
- ip->i_d.di_aformat != 0 &&
- ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
- XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
+ if (!XFS_IFORK_Q(ip))
+ goto out_unlock_iolock;
- prealloced = 0;
- fixlen = 1LL << 32;
+ max_len = 1LL << 32;
+ lock = xfs_ilock_attr_map_shared(ip);
break;
case XFS_COW_FORK:
- if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
- return -EINVAL;
+ /* No CoW fork? Just return */
+ if (!ifp)
+ goto out_unlock_iolock;
- if (xfs_get_cowextsz_hint(ip)) {
- prealloced = 1;
- fixlen = mp->m_super->s_maxbytes;
- } else {
- prealloced = 0;
- fixlen = XFS_ISIZE(ip);
- }
- break;
- default:
- /* Local format data forks report no extents. */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
- bmv->bmv_entries = 0;
- return 0;
- }
- if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
- return -EINVAL;
+ if (xfs_get_cowextsz_hint(ip))
+ max_len = mp->m_super->s_maxbytes;
+ else
+ max_len = XFS_ISIZE(ip);
- if (xfs_get_extsz_hint(ip) ||
- ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
- prealloced = 1;
- fixlen = mp->m_super->s_maxbytes;
- } else {
- prealloced = 0;
- fixlen = XFS_ISIZE(ip);
- }
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
break;
- }
-
- if (bmv->bmv_length == -1) {
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
- bmv->bmv_length =
- max_t(int64_t, fixlen - bmv->bmv_offset, 0);
- } else if (bmv->bmv_length == 0) {
- bmv->bmv_entries = 0;
- return 0;
- } else if (bmv->bmv_length < 0) {
- return -EINVAL;
- }
-
- nex = bmv->bmv_count - 1;
- if (nex <= 0)
- return -EINVAL;
- bmvend = bmv->bmv_offset + bmv->bmv_length;
-
-
- if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
- return -ENOMEM;
- out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
- if (!out)
- return -ENOMEM;
-
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- switch (whichfork) {
case XFS_DATA_FORK:
if (!(iflags & BMV_IF_DELALLOC) &&
(ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
@@ -668,154 +592,105 @@ xfs_getbmap(
*/
}
+ if (xfs_get_extsz_hint(ip) ||
+ (ip->i_d.di_flags &
+ (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
+ max_len = mp->m_super->s_maxbytes;
+ else
+ max_len = XFS_ISIZE(ip);
+
lock = xfs_ilock_data_map_shared(ip);
break;
- case XFS_COW_FORK:
- lock = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lock);
- break;
- case XFS_ATTR_FORK:
- lock = xfs_ilock_attr_map_shared(ip);
- break;
}
- /*
- * Don't let nex be bigger than the number of extents
- * we can have assuming alternating holes and real extents.
- */
- if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
- nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-
- bmapi_flags = xfs_bmapi_aflag(whichfork);
- if (!(iflags & BMV_IF_PREALLOC))
- bmapi_flags |= XFS_BMAPI_IGSTATE;
-
- /*
- * Allocate enough space to handle "subnex" maps at a time.
- */
- error = -ENOMEM;
- subnex = 16;
- map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
- if (!map)
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /* Local format inode forks report no extents. */
goto out_unlock_ilock;
+ default:
+ error = -EINVAL;
+ goto out_unlock_ilock;
+ }
- bmv->bmv_entries = 0;
-
- if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
- (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
- error = 0;
- goto out_free_map;
+ if (bmv->bmv_length == -1) {
+ max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
+ bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
}
- do {
- nmap = (nex> subnex) ? subnex : nex;
- error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
- XFS_BB_TO_FSB(mp, bmv->bmv_length),
- map, &nmap, bmapi_flags);
- if (error)
- goto out_free_map;
- ASSERT(nmap <= subnex);
-
- for (i = 0; i < nmap && bmv->bmv_length &&
- cur_ext < bmv->bmv_count - 1; i++) {
- out[cur_ext].bmv_oflags = 0;
- if (map[i].br_state == XFS_EXT_UNWRITTEN)
- out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
- else if (map[i].br_startblock == DELAYSTARTBLOCK)
- out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
- out[cur_ext].bmv_offset =
- XFS_FSB_TO_BB(mp, map[i].br_startoff);
- out[cur_ext].bmv_length =
- XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- out[cur_ext].bmv_unused1 = 0;
- out[cur_ext].bmv_unused2 = 0;
+ bmv_end = bmv->bmv_offset + bmv->bmv_length;
- /*
- * delayed allocation extents that start beyond EOF can
- * occur due to speculative EOF allocation when the
- * delalloc extent is larger than the largest freespace
- * extent at conversion time. These extents cannot be
- * converted by data writeback, so can exist here even
- * if we are not supposed to be finding delalloc
- * extents.
- */
- if (map[i].br_startblock == DELAYSTARTBLOCK &&
- map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
- ASSERT((iflags & BMV_IF_DELALLOC) != 0);
-
- if (map[i].br_startblock == HOLESTARTBLOCK &&
- whichfork == XFS_ATTR_FORK) {
- /* came to the end of attribute fork */
- out[cur_ext].bmv_oflags |= BMV_OF_LAST;
- goto out_free_map;
- }
+ first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
+ len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
- /* Is this a shared block? */
- error = xfs_getbmap_adjust_shared(ip, whichfork,
- &map[i], &out[cur_ext], &inject_map);
- if (error)
- goto out_free_map;
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, whichfork);
+ if (error)
+ goto out_unlock_ilock;
+ }
- if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
- &out[cur_ext], prealloced, bmvend,
- map[i].br_startblock,
- inject_map.br_startblock != NULLFSBLOCK))
- goto out_free_map;
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
+ /*
+ * Report a whole-file hole if the delalloc flag is set to
+ * stay compatible with the old implementation.
+ */
+ if (iflags & BMV_IF_DELALLOC)
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
+ XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+ goto out_unlock_ilock;
+ }
- bmv->bmv_offset =
- out[cur_ext].bmv_offset +
- out[cur_ext].bmv_length;
- bmv->bmv_length =
- max_t(int64_t, 0, bmvend - bmv->bmv_offset);
+ while (!xfs_getbmap_full(bmv)) {
+ xfs_trim_extent(&got, first_bno, len);
- /*
- * In case we don't want to return the hole,
- * don't increase cur_ext so that we can reuse
- * it in the next loop.
- */
- if ((iflags & BMV_IF_NO_HOLES) &&
- map[i].br_startblock == HOLESTARTBLOCK) {
- memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
- continue;
- }
+ /*
+ * Report an entry for a hole if this extent doesn't directly
+ * follow the previous one.
+ */
+ if (got.br_startoff > bno) {
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
+ got.br_startoff);
+ if (xfs_getbmap_full(bmv))
+ break;
+ }
- /*
- * In order to report shared extents accurately,
- * we report each distinct shared/unshared part
- * of a single bmbt record using multiple bmap
- * extents. To make that happen, we iterate the
- * same map array item multiple times, each
- * time trimming out the subextent that we just
- * reported.
- *
- * Because of this, we must check the out array
- * index (cur_ext) directly against bmv_count-1
- * to avoid overflows.
- */
- if (inject_map.br_startblock != NULLFSBLOCK) {
- map[i] = inject_map;
- i--;
+ /*
+ * In order to report shared extents accurately, we report each
+ * distinct shared / unshared part of a single bmbt record with
+ * an individual getbmapx record.
+ */
+ bno = got.br_startoff + got.br_blockcount;
+ rec = got;
+ do {
+ error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
+ &rec);
+ if (error || xfs_getbmap_full(bmv))
+ goto out_unlock_ilock;
+ } while (xfs_getbmap_next_rec(&rec, bno));
+
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+ out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
+
+ if (whichfork != XFS_ATTR_FORK && bno < end &&
+ !xfs_getbmap_full(bmv)) {
+ xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
+ bno, end);
}
- bmv->bmv_entries++;
- cur_ext++;
+ break;
}
- } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
- out_free_map:
- kmem_free(map);
- out_unlock_ilock:
- xfs_iunlock(ip, lock);
- out_unlock_iolock:
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- for (i = 0; i < cur_ext; i++) {
- /* format results & advance arg */
- error = formatter(&arg, &out[i]);
- if (error)
+ if (bno >= first_bno + len)
break;
}
- kmem_free(out);
+out_unlock_ilock:
+ xfs_iunlock(ip, lock);
+out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return error;
}
@@ -1387,53 +1262,12 @@ out:
}
-/*
- * @next_fsb will keep track of the extent currently undergoing shift.
- * @stop_fsb will keep track of the extent at which we have to stop.
- * If we are shifting left, we will start with block (offset + len) and
- * shift each extent till last extent.
- * If we are shifting right, we will start with last extent inside file space
- * and continue until we reach the block corresponding to offset.
- */
static int
-xfs_shift_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len,
- enum shift_direction direction)
+xfs_prepare_shift(
+ struct xfs_inode *ip,
+ loff_t offset)
{
- int done = 0;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
int error;
- struct xfs_defer_ops dfops;
- xfs_fsblock_t first_block;
- xfs_fileoff_t stop_fsb;
- xfs_fileoff_t next_fsb;
- xfs_fileoff_t shift_fsb;
- uint resblks;
-
- ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
-
- if (direction == SHIFT_LEFT) {
- /*
- * Reserve blocks to cover potential extent merges after left
- * shift operations.
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- next_fsb = XFS_B_TO_FSB(mp, offset + len);
- stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
- } else {
- /*
- * If right shift, delegate the work of initialization of
- * next_fsb to xfs_bmap_shift_extent as it has ilock held.
- */
- resblks = 0;
- next_fsb = NULLFSBLOCK;
- stop_fsb = XFS_B_TO_FSB(mp, offset);
- }
-
- shift_fsb = XFS_B_TO_FSB(mp, len);
/*
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
@@ -1449,8 +1283,7 @@ xfs_shift_file_space(
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- offset, -1);
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
if (error)
return error;
error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
@@ -1459,16 +1292,62 @@ xfs_shift_file_space(
return error;
/*
- * The extent shiting code works on extent granularity. So, if
- * stop_fsb is not the starting block of extent, we need to split
- * the extent at stop_fsb.
+ * Clean out anything hanging around in the cow fork now that
+ * we've flushed all the dirty data out to disk to avoid having
+ * CoW extents at the wrong offsets.
*/
- if (direction == SHIFT_RIGHT) {
- error = xfs_bmap_split_extent(ip, stop_fsb);
+ if (xfs_is_reflink_inode(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
+ true);
if (error)
return error;
}
+ return 0;
+}
+
+/*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shifts extents for the given file.
+ * The first thing we do is to free data blocks in the specified range
+ * by calling xfs_free_file_space(). We then sync dirty data and
+ * invalidate the page cache over the region on which collapse range
+ * is working, and shift extent records to the left to cover the hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+ xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
+ uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ bool done = false;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
+ trace_xfs_collapse_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
+
+ error = xfs_prepare_shift(ip, offset);
+ if (error)
+ return error;
+
while (!error && !done) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
&tp);
@@ -1481,25 +1360,17 @@ xfs_shift_file_space(
XFS_QMOPT_RES_REGBLKS);
if (error)
goto out_trans_cancel;
-
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_defer_init(&dfops, &first_block);
-
- /*
- * We are using the write transaction in which max 2 bmbt
- * updates are allowed
- */
- error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
- &done, stop_fsb, &first_block, &dfops,
- direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
+ error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
+ &done, stop_fsb, &first_block, &dfops);
if (error)
goto out_bmap_cancel;
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
-
error = xfs_trans_commit(tp);
}
@@ -1513,36 +1384,6 @@ out_trans_cancel:
}
/*
- * xfs_collapse_file_space()
- * This routine frees disk space and shift extent for the given file.
- * The first thing we do is to free data blocks in the specified range
- * by calling xfs_free_file_space(). It would also sync dirty data
- * and invalidate page cache over the region on which collapse range
- * is working. And Shift extent records to the left to cover a hole.
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-int
-xfs_collapse_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len)
-{
- int error;
-
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- trace_xfs_collapse_file_space(ip);
-
- error = xfs_free_file_space(ip, offset, len);
- if (error)
- return error;
-
- return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
-}
-
-/*
* xfs_insert_file_space()
* This routine create hole space by shifting extents for the given file.
* The first thing we do is to sync dirty data and invalidate page cache
@@ -1560,10 +1401,60 @@ xfs_insert_file_space(
loff_t offset,
loff_t len)
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, offset);
+ xfs_fileoff_t next_fsb = NULLFSBLOCK;
+ xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
+ bool done = false;
+
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+
trace_xfs_insert_file_space(ip);
- return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
+ error = xfs_prepare_shift(ip, offset);
+ if (error)
+ return error;
+
+ /*
+ * The extent shifting code works on extent granularity. So, if stop_fsb
+ * is not the starting block of extent, we need to split the extent at
+ * stop_fsb.
+ */
+ error = xfs_bmap_split_extent(ip, stop_fsb);
+ if (error)
+ return error;
+
+ while (!error && !done) {
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0,
+ &tp);
+ if (error)
+ break;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_defer_init(&dfops, &first_block);
+ error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
+ &done, stop_fsb, &first_block, &dfops);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_defer_finish(&tp, &dfops);
+ if (error)
+ goto out_bmap_cancel;
+ error = xfs_trans_commit(tp);
+ }
+
+ return error;
+
+out_bmap_cancel:
+ xfs_defer_cancel(&dfops);
+ xfs_trans_cancel(tp);
+ return error;
}
/*
@@ -1818,7 +1709,6 @@ xfs_swap_extent_forks(
xfs_filblks_t aforkblks = 0;
xfs_filblks_t taforkblks = 0;
xfs_extnum_t junk;
- xfs_extnum_t nextents;
uint64_t tmp;
int error;
@@ -1893,13 +1783,6 @@ xfs_swap_extent_forks(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /*
- * If the extents fit in the inode, fix the pointer. Otherwise
- * it's already NULL or pointing to the extent.
- */
- nextents = xfs_iext_count(&ip->i_df);
- if (nextents <= XFS_INLINE_EXTS)
- ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -1911,13 +1794,6 @@ xfs_swap_extent_forks(
switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /*
- * If the extents fit in the inode, fix the pointer. Otherwise
- * it's already NULL or pointing to the extent.
- */
- nextents = xfs_iext_count(&tip->i_df);
- if (nextents <= XFS_INLINE_EXTS)
- tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext;
(*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -2110,11 +1986,31 @@ xfs_swap_extents(
ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+ }
+
+ /* Swap the cow forks. */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ xfs_extnum_t extnum;
+
+ ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+
+ extnum = ip->i_cnextents;
+ ip->i_cnextents = tip->i_cnextents;
+ tip->i_cnextents = extnum;
+
cowfp = ip->i_cowfp;
ip->i_cowfp = tip->i_cowfp;
tip->i_cowfp = cowfp;
- xfs_inode_set_cowblocks_tag(ip);
- xfs_inode_set_cowblocks_tag(tip);
+
+ if (ip->i_cowfp && ip->i_cnextents)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+ if (tip->i_cowfp && tip->i_cnextents)
+ xfs_inode_set_cowblocks_tag(tip);
+ else
+ xfs_inode_clear_cowblocks_tag(tip);
}
xfs_trans_log_inode(tp, ip, src_log_flags);
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0eaa81dc49be..4d4ae48bd4f6 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -28,16 +28,33 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+#else /* !CONFIG_XFS_RT */
+/*
+ * Attempts to allocate RT extents when RT is disabled indicates corruption and
+ * should trigger a shutdown.
+ */
+static inline int
+xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
+{
+ return -EFSCORRUPTED;
+}
+#endif /* CONFIG_XFS_RT */
+
int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
int whichfork, int *eof);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
-/* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *);
+struct kgetbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_oflags; /* output flags */
+};
int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
- xfs_bmap_format_t formatter, void *arg);
+ struct kgetbmap *out);
/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index da14658da310..4db6e8d780f6 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -42,6 +42,8 @@
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
static kmem_zone_t *xfs_buf_zone;
@@ -1258,8 +1260,6 @@ xfs_buf_ioapply_map(
int size;
int offset;
- total_nr_pages = bp->b_page_count;
-
/* skip the pages in the buffer before the start offset */
page_index = 0;
offset = *buf_offset;
@@ -2131,3 +2131,17 @@ xfs_buf_terminate(void)
{
kmem_zone_destroy(xfs_buf_zone);
}
+
+void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
+{
+ /*
+ * Set the lru reference count to 0 based on the error injection tag.
+ * This allows userspace to disrupt buffer caching for debug/testing
+ * purposes.
+ */
+ if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
+ XFS_ERRTAG_BUF_LRU_REF))
+ lru_ref = 0;
+
+ atomic_set(&bp->b_lru_ref, lru_ref);
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index bf71507ddb16..f873bb786824 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -352,10 +352,7 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
-static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
-{
- atomic_set(&bp->b_lru_ref, lru_ref);
-}
+void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index ba2638d37031..0c58918bc0ad 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -41,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = {
DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
};
-static unsigned char
+unsigned char
xfs_dir3_get_dtype(
struct xfs_mount *mp,
uint8_t filetype)
@@ -266,7 +266,7 @@ xfs_dir2_leaf_readbuf(
xfs_dablk_t next_ra;
xfs_dablk_t map_off;
xfs_dablk_t last_da;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
int ra_want;
int error = 0;
@@ -283,7 +283,7 @@ xfs_dir2_leaf_readbuf(
*/
last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
- if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map))
+ if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map))
goto out;
if (map.br_startoff >= last_da)
goto out;
@@ -311,7 +311,7 @@ xfs_dir2_leaf_readbuf(
if (next_ra >= last_da)
goto out_no_ra;
if (map.br_blockcount < geo->fsbcount &&
- !xfs_iext_get_extent(ifp, ++idx, &map))
+ !xfs_iext_next_extent(ifp, &icur, &map))
goto out_no_ra;
if (map.br_startoff >= last_da)
goto out_no_ra;
@@ -334,7 +334,7 @@ xfs_dir2_leaf_readbuf(
ra_want -= geo->fsbcount;
next_ra += geo->fsbcount;
}
- if (!xfs_iext_get_extent(ifp, ++idx, &map)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &map)) {
*ra_blk = last_da;
break;
}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index 0f070f9e44e1..de92d9cc958f 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef XFS_DISCARD_H
#define XFS_DISCARD_H 1
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index cd82429d8df7..d57c2db64e59 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -53,13 +53,6 @@
* otherwise by the lowest id first, see xfs_dqlock2.
*/
-#ifdef DEBUG
-xfs_buftarg_t *xfs_dqerror_target;
-int xfs_do_dqerror;
-int xfs_dqreq_num;
-int xfs_dqerror_mod = 33;
-#endif
-
struct kmem_zone *xfs_qm_dqtrxzone;
static struct kmem_zone *xfs_qm_dqzone;
@@ -703,7 +696,7 @@ xfs_dq_get_next_id(
xfs_dqid_t next_id = *id + 1; /* simple advance */
uint lock_flags;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor cur;
xfs_fsblock_t start;
int error = 0;
@@ -727,7 +720,7 @@ xfs_dq_get_next_id(
return error;
}
- if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &idx, &got)) {
+ if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &cur, &got)) {
/* contiguous chunk, bump startoff for the id calculation */
if (got.br_startoff < start)
got.br_startoff = start;
@@ -770,15 +763,6 @@ xfs_qm_dqget(
return -ESRCH;
}
-#ifdef DEBUG
- if (xfs_do_dqerror) {
- if ((xfs_dqerror_target == mp->m_ddev_targp) &&
- (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
- xfs_debug(mp, "Returning error in dqget");
- return -EIO;
- }
- }
-
ASSERT(type == XFS_DQ_USER ||
type == XFS_DQ_PROJ ||
type == XFS_DQ_GROUP);
@@ -786,7 +770,6 @@ xfs_qm_dqget(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(xfs_inode_dquot(ip, type) == NULL);
}
-#endif
restart:
mutex_lock(&qi->qi_tree_lock);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index bd786a9ac2c3..4c9f35d983b2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -21,6 +21,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_sysfs.h"
@@ -58,6 +59,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_DROP_WRITES,
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
+ XFS_RANDOM_BUF_LRU_REF,
};
struct xfs_errortag_attr {
@@ -163,6 +165,7 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
+XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -196,10 +199,11 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(drop_writes),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
+ XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
NULL,
};
-struct kobj_type xfs_errortag_ktype = {
+static struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_errortag_sysfs_ops,
.default_attrs = xfs_errortag_attrs,
@@ -347,7 +351,7 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
__return_address, bp->b_ops->name, bp->b_bn);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 7c4bef3bddb7..ea816c1bf8db 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -63,87 +63,6 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
} \
}
-/*
- * error injection tags - the labels can be anything you want
- * but each tag should have its own unique number
- */
-
-#define XFS_ERRTAG_NOERROR 0
-#define XFS_ERRTAG_IFLUSH_1 1
-#define XFS_ERRTAG_IFLUSH_2 2
-#define XFS_ERRTAG_IFLUSH_3 3
-#define XFS_ERRTAG_IFLUSH_4 4
-#define XFS_ERRTAG_IFLUSH_5 5
-#define XFS_ERRTAG_IFLUSH_6 6
-#define XFS_ERRTAG_DA_READ_BUF 7
-#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
-#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
-#define XFS_ERRTAG_ALLOC_READ_AGF 10
-#define XFS_ERRTAG_IALLOC_READ_AGI 11
-#define XFS_ERRTAG_ITOBP_INOTOBP 12
-#define XFS_ERRTAG_IUNLINK 13
-#define XFS_ERRTAG_IUNLINK_REMOVE 14
-#define XFS_ERRTAG_DIR_INO_VALIDATE 15
-#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
-#define XFS_ERRTAG_IODONE_IOERR 17
-#define XFS_ERRTAG_STRATREAD_IOERR 18
-#define XFS_ERRTAG_STRATCMPL_IOERR 19
-#define XFS_ERRTAG_DIOWRITE_IOERR 20
-#define XFS_ERRTAG_BMAPIFORMAT 21
-#define XFS_ERRTAG_FREE_EXTENT 22
-#define XFS_ERRTAG_RMAP_FINISH_ONE 23
-#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24
-#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
-#define XFS_ERRTAG_BMAP_FINISH_ONE 26
-#define XFS_ERRTAG_AG_RESV_CRITICAL 27
-/*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
- */
-#define XFS_ERRTAG_DROP_WRITES 28
-#define XFS_ERRTAG_LOG_BAD_CRC 29
-#define XFS_ERRTAG_LOG_ITEM_PIN 30
-#define XFS_ERRTAG_MAX 31
-
-/*
- * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
- */
-#define XFS_RANDOM_DEFAULT 100
-#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
-#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_FREE_EXTENT 1
-#define XFS_RANDOM_RMAP_FINISH_ONE 1
-#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
-#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
-#define XFS_RANDOM_BMAP_FINISH_ONE 1
-#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_DROP_WRITES 1
-#define XFS_RANDOM_LOG_BAD_CRC 1
-#define XFS_RANDOM_LOG_ITEM_PIN 1
-
#ifdef DEBUG
extern int xfs_errortag_init(struct xfs_mount *mp);
extern void xfs_errortag_del(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ebdd0bd2b261..8601275cc5e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
#include <linux/falloc.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
+#include <linux/mman.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -58,7 +59,7 @@ xfs_zero_range(
xfs_off_t count,
bool *did_zero)
{
- return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
+ return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops);
}
int
@@ -237,11 +238,13 @@ xfs_file_dax_read(
if (!count)
return 0; /* skip atime */
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
return -EAGAIN;
+ } else {
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
+
ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -259,9 +262,10 @@ xfs_file_buffered_aio_read(
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
return -EAGAIN;
+ } else {
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
ret = generic_file_read_iter(iocb, to);
@@ -377,8 +381,6 @@ restart:
*/
spin_lock(&ip->i_flags_lock);
if (iocb->ki_pos > i_size_read(inode)) {
- bool zero = false;
-
spin_unlock(&ip->i_flags_lock);
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
@@ -399,7 +401,7 @@ restart:
drained_dio = true;
goto restart;
}
- error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
+ error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL);
if (error)
return error;
} else
@@ -436,7 +438,6 @@ xfs_dio_write_end_io(
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
loff_t offset = iocb->ki_pos;
- bool update_size = false;
int error = 0;
trace_xfs_end_io_direct_write(ip, offset, size);
@@ -447,6 +448,21 @@ xfs_dio_write_end_io(
if (size <= 0)
return size;
+ if (flags & IOMAP_DIO_COW) {
+ error = xfs_reflink_end_cow(ip, offset, size);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Unwritten conversion updates the in-core isize after extent
+ * conversion but before updating the on-disk size. Updating isize any
+ * earlier allows a racing dio read to find unwritten extents before
+ * they are converted.
+ */
+ if (flags & IOMAP_DIO_UNWRITTEN)
+ return xfs_iomap_write_unwritten(ip, offset, size, true);
+
/*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
@@ -461,20 +477,11 @@ xfs_dio_write_end_io(
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode)) {
i_size_write(inode, offset + size);
- update_size = true;
- }
- spin_unlock(&ip->i_flags_lock);
-
- if (flags & IOMAP_DIO_COW) {
- error = xfs_reflink_end_cow(ip, offset, size);
- if (error)
- return error;
- }
-
- if (flags & IOMAP_DIO_UNWRITTEN)
- error = xfs_iomap_write_unwritten(ip, offset, size);
- else if (update_size)
+ spin_unlock(&ip->i_flags_lock);
error = xfs_setfilesize(ip, offset, size);
+ } else {
+ spin_unlock(&ip->i_flags_lock);
+ }
return error;
}
@@ -549,9 +556,10 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- if (!xfs_ilock_nowait(ip, iolock)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, iolock))
return -EAGAIN;
+ } else {
xfs_ilock(ip, iolock);
}
@@ -603,9 +611,10 @@ xfs_file_dax_write(
size_t count;
loff_t pos;
- if (!xfs_ilock_nowait(ip, iolock)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, iolock))
return -EAGAIN;
+ } else {
xfs_ilock(ip, iolock);
}
@@ -761,7 +770,7 @@ xfs_file_fallocate(
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
loff_t new_size = 0;
- bool do_file_insert = 0;
+ bool do_file_insert = false;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
@@ -822,7 +831,7 @@ xfs_file_fallocate(
error = -EINVAL;
goto out_unlock;
}
- do_file_insert = 1;
+ do_file_insert = true;
} else {
flags |= XFS_PREALLOC_SET;
@@ -976,7 +985,7 @@ xfs_file_readdir(
* point we can change the ->readdir prototype to include the
* buffer size. For now we use the current glibc buffer size.
*/
- bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
return xfs_readdir(NULL, ip, ctx, bufsize);
}
@@ -1037,7 +1046,11 @@ __xfs_filemap_fault(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+ pfn_t pfn;
+
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else {
if (write_fault)
ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1082,37 +1095,16 @@ xfs_filemap_page_mkwrite(
}
/*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing, so handle it as a standard write fault.
*/
static int
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- int ret = VM_FAULT_NOPAGE;
- loff_t size;
-
- trace_xfs_filemap_pfn_mkwrite(ip);
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
-
- /* check if the faulting page hasn't raced with truncate */
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else if (IS_DAX(inode))
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
- return ret;
-
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1128,6 +1120,13 @@ xfs_file_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
+ /*
+ * We don't support synchronous mappings for non-DAX files. At least
+ * until someone comes up with a sensible use case.
+ */
+ if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+ return -EOPNOTSUPP;
+
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
@@ -1146,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.mmap = xfs_file_mmap,
+ .mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 814ed729881d..43cfc07996a4 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -367,29 +367,6 @@ xfs_getfsmap_datadev_helper(
return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
}
-/* Transform a rtbitmap "record" into a fsmap */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_helper(
- struct xfs_trans *tp,
- struct xfs_rtalloc_rec *rec,
- void *priv)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_getfsmap_info *info = priv;
- struct xfs_rmap_irec irec;
- xfs_daddr_t rec_daddr;
-
- rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
- irec.rm_startblock = rec->ar_startblock;
- irec.rm_blockcount = rec->ar_blockcount;
- irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
- irec.rm_offset = 0;
- irec.rm_flags = 0;
-
- return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
-}
-
/* Transform a bnobt irec into a fsmap */
STATIC int
xfs_getfsmap_datadev_bnobt_helper(
@@ -475,6 +452,30 @@ xfs_getfsmap_logdev(
return xfs_getfsmap_helper(tp, info, &rmap, 0);
}
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
/* Execute a getfsmap query against the realtime device. */
STATIC int
__xfs_getfsmap_rtdev(
@@ -561,6 +562,7 @@ xfs_getfsmap_rtdev_rtbitmap(
return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
info);
}
+#endif /* CONFIG_XFS_RT */
/* Execute a getfsmap query against the regular data device. */
STATIC int
@@ -795,7 +797,15 @@ xfs_getfsmap_check_keys(
return false;
}
+/*
+ * There are only two devices if we didn't configure RT devices at build time.
+ */
+#ifdef CONFIG_XFS_RT
#define XFS_GETFSMAP_DEVS 3
+#else
+#define XFS_GETFSMAP_DEVS 2
+#endif /* CONFIG_XFS_RT */
+
/*
* Get filesystem's extents as described in head, and format for
* output. Calls formatter to fill the user's buffer until all
@@ -853,10 +863,12 @@ xfs_getfsmap(
handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
handlers[1].fn = xfs_getfsmap_logdev;
}
+#ifdef CONFIG_XFS_RT
if (mp->m_rtdev_targp) {
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
}
+#endif /* CONFIG_XFS_RT */
xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
xfs_getfsmap_dev_compare);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 34227115a5d6..43005fbe8b1e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -610,7 +610,7 @@ again:
} else {
rcu_read_unlock();
if (flags & XFS_IGET_INCORE) {
- error = -ENOENT;
+ error = -ENODATA;
goto out_error_or_again;
}
XFS_STATS_INC(mp, xs_ig_missed);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5599dda4727a..61d1cb7dc10d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -39,6 +39,7 @@
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
@@ -384,14 +385,6 @@ xfs_isilocked(
}
#endif
-#ifdef DEBUG
-int xfs_locked_n;
-int xfs_small_retries;
-int xfs_middle_retries;
-int xfs_lots_retries;
-int xfs_lock_delays;
-#endif
-
/*
* xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
* DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
@@ -544,24 +537,11 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
- xfs_lock_delays++;
-#endif
}
i = 0;
try_lock = 0;
goto again;
}
-
-#ifdef DEBUG
- if (attempts) {
- if (attempts < 5) xfs_small_retries++;
- else if (attempts < 100) xfs_middle_retries++;
- else xfs_lots_retries++;
- } else {
- xfs_locked_n++;
- }
-#endif
}
/*
@@ -767,7 +747,7 @@ xfs_ialloc(
xfs_inode_t *pip,
umode_t mode,
xfs_nlink_t nlink,
- xfs_dev_t rdev,
+ dev_t rdev,
prid_t prid,
int okalloc,
xfs_buf_t **ialloc_context,
@@ -819,6 +799,7 @@ xfs_ialloc(
set_nlink(inode, nlink);
ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+ inode->i_rdev = rdev;
xfs_set_projid(ip, prid);
if (pip && XFS_INHERIT_GID(pip)) {
@@ -867,7 +848,6 @@ xfs_ialloc(
case S_IFBLK:
case S_IFSOCK:
ip->i_d.di_format = XFS_DINODE_FMT_DEV;
- ip->i_df.if_u2.if_rdev = rdev;
ip->i_df.if_flags = 0;
flags |= XFS_ILOG_DEV;
break;
@@ -933,7 +913,7 @@ xfs_ialloc(
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_flags = XFS_IFEXTENTS;
ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
- ip->i_df.if_u1.if_extents = NULL;
+ ip->i_df.if_u1.if_root = NULL;
break;
default:
ASSERT(0);
@@ -975,7 +955,7 @@ xfs_dir_ialloc(
the inode. */
umode_t mode,
xfs_nlink_t nlink,
- xfs_dev_t rdev,
+ dev_t rdev,
prid_t prid, /* project id */
int okalloc, /* ok to allocate new space */
xfs_inode_t **ipp, /* pointer to inode; it will be
@@ -1147,7 +1127,7 @@ xfs_create(
xfs_inode_t *dp,
struct xfs_name *name,
umode_t mode,
- xfs_dev_t rdev,
+ dev_t rdev,
xfs_inode_t **ipp)
{
int is_dir = S_ISDIR(mode);
@@ -1183,7 +1163,6 @@ xfs_create(
return error;
if (is_dir) {
- rdev = 0;
resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
tres = &M_RES(mp)->tr_mkdir;
} else {
@@ -1624,10 +1603,12 @@ xfs_itruncate_extents(
goto out;
/*
- * Clear the reflink flag if we truncated everything.
+ * Clear the reflink flag if there are no data fork blocks and
+ * there are no extents staged in the cow fork.
*/
- if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) {
+ if (ip->i_d.di_nblocks == 0)
+ ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
}
@@ -2376,6 +2357,7 @@ retry:
*/
if (ip->i_ino != inum + i) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ rcu_read_unlock();
continue;
}
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0ee453de239a..cc13c3763721 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -391,7 +391,7 @@ void xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
- umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+ umode_t mode, dev_t rdev, struct xfs_inode **ipp);
int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
umode_t mode, struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
@@ -428,7 +428,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
- xfs_nlink_t, xfs_dev_t, prid_t, int,
+ xfs_nlink_t, dev_t, prid_t, int,
struct xfs_inode **, int *);
/* from xfs_file.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6d0f74ec31e8..6ee5c3bf19ad 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -72,7 +72,6 @@ xfs_inode_item_data_fork_size(
break;
case XFS_DINODE_FMT_DEV:
- case XFS_DINODE_FMT_UUID:
break;
default:
ASSERT(0);
@@ -156,15 +155,13 @@ xfs_inode_item_format_data_fork(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_d.di_nextents > 0 &&
ip->i_df.if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(ip->i_df.if_u1.if_extents != NULL);
ASSERT(xfs_iext_count(&ip->i_df) > 0);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
@@ -181,8 +178,7 @@ xfs_inode_item_format_data_fork(
break;
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
@@ -200,8 +196,7 @@ xfs_inode_item_format_data_fork(
break;
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
- ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
- XFS_ILOG_DEV | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
/*
@@ -224,17 +219,9 @@ xfs_inode_item_format_data_fork(
break;
case XFS_DINODE_FMT_DEV:
iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_UUID);
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
if (iip->ili_fields & XFS_ILOG_DEV)
- ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
- break;
- case XFS_DINODE_FMT_UUID:
- iip->ili_fields &=
- ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
- XFS_ILOG_DEXT | XFS_ILOG_DEV);
- if (iip->ili_fields & XFS_ILOG_UUID)
- ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
+ ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
break;
default:
ASSERT(0);
@@ -264,7 +251,6 @@ xfs_inode_item_format_attr_fork(
ASSERT(xfs_iext_count(ip->i_afp) ==
ip->i_d.di_anextents);
- ASSERT(ip->i_afp->if_u1.if_extents != NULL);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -364,6 +350,9 @@ xfs_inode_to_log_dinode(
to->di_dmstate = from->di_dmstate;
to->di_flags = from->di_flags;
+ /* log a dummy value to ensure log structure is fully initialised */
+ to->di_next_unlinked = NULLAGINO;
+
if (from->di_version == 3) {
to->di_changecount = inode->i_version;
to->di_crtime.t_sec = from->di_crtime.t_sec;
@@ -404,6 +393,11 @@ xfs_inode_item_format_core(
* the second with the on-disk inode structure, and a possible third and/or
* fourth with the inode data/extents/b-tree root and inode attributes
* data/extents/b-tree root.
+ *
+ * Note: Always use the 64 bit inode log format structure so we don't
+ * leave an uninitialised hole in the format item on 64 bit systems. Log
+ * recovery on 32 bit systems handles this just fine, so there's no reason
+ * for not using and initialising the properly padded structure all the time.
*/
STATIC void
xfs_inode_item_format(
@@ -412,8 +406,8 @@ xfs_inode_item_format(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- struct xfs_inode_log_format *ilf;
struct xfs_log_iovec *vecp = NULL;
+ struct xfs_inode_log_format *ilf;
ASSERT(ip->i_d.di_version > 1);
@@ -425,7 +419,17 @@ xfs_inode_item_format(
ilf->ilf_boffset = ip->i_imap.im_boffset;
ilf->ilf_fields = XFS_ILOG_CORE;
ilf->ilf_size = 2; /* format + core */
- xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+ /*
+ * make sure we don't leak uninitialised data into the log in the case
+ * when we don't log every field in the inode.
+ */
+ ilf->ilf_dsize = 0;
+ ilf->ilf_asize = 0;
+ ilf->ilf_pad = 0;
+ memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u));
+
+ xlog_finish_iovec(lv, vecp, sizeof(*ilf));
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
@@ -745,7 +749,7 @@ xfs_iflush_done(
*/
iip = INODE_ITEM(blip);
if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
- lip->li_flags & XFS_LI_FAILED)
+ (blip->li_flags & XFS_LI_FAILED))
need_ail++;
blip = next;
@@ -855,44 +859,28 @@ xfs_istale_done(
}
/*
- * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
- * (which can have different field alignments) to the native version
+ * convert an xfs_inode_log_format struct from the old 32 bit version
+ * (which can have different field alignments) to the native 64 bit version
*/
int
xfs_inode_item_format_convert(
- xfs_log_iovec_t *buf,
- xfs_inode_log_format_t *in_f)
+ struct xfs_log_iovec *buf,
+ struct xfs_inode_log_format *in_f)
{
- if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
- xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
-
- in_f->ilf_type = in_f32->ilf_type;
- in_f->ilf_size = in_f32->ilf_size;
- in_f->ilf_fields = in_f32->ilf_fields;
- in_f->ilf_asize = in_f32->ilf_asize;
- in_f->ilf_dsize = in_f32->ilf_dsize;
- in_f->ilf_ino = in_f32->ilf_ino;
- /* copy biggest field of ilf_u */
- uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f32->ilf_u.ilfu_uuid);
- in_f->ilf_blkno = in_f32->ilf_blkno;
- in_f->ilf_len = in_f32->ilf_len;
- in_f->ilf_boffset = in_f32->ilf_boffset;
- return 0;
- } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
- xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
-
- in_f->ilf_type = in_f64->ilf_type;
- in_f->ilf_size = in_f64->ilf_size;
- in_f->ilf_fields = in_f64->ilf_fields;
- in_f->ilf_asize = in_f64->ilf_asize;
- in_f->ilf_dsize = in_f64->ilf_dsize;
- in_f->ilf_ino = in_f64->ilf_ino;
- /* copy biggest field of ilf_u */
- uuid_copy(&in_f->ilf_u.ilfu_uuid, &in_f64->ilf_u.ilfu_uuid);
- in_f->ilf_blkno = in_f64->ilf_blkno;
- in_f->ilf_len = in_f64->ilf_len;
- in_f->ilf_boffset = in_f64->ilf_boffset;
- return 0;
- }
- return -EFSCORRUPTED;
+ struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+
+ if (buf->i_len != sizeof(*in_f32))
+ return -EFSCORRUPTED;
+
+ in_f->ilf_type = in_f32->ilf_type;
+ in_f->ilf_size = in_f32->ilf_size;
+ in_f->ilf_fields = in_f32->ilf_fields;
+ in_f->ilf_asize = in_f32->ilf_asize;
+ in_f->ilf_dsize = in_f32->ilf_dsize;
+ in_f->ilf_ino = in_f32->ilf_ino;
+ memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u));
+ in_f->ilf_blkno = in_f32->ilf_blkno;
+ in_f->ilf_len = in_f32->ilf_len;
+ in_f->ilf_boffset = in_f32->ilf_boffset;
+ return 0;
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 4c7722e325b3..b72373a33cd9 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -48,7 +48,7 @@ extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
extern void xfs_iflush_abort(struct xfs_inode *, bool);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
- xfs_inode_log_format_t *);
+ struct xfs_inode_log_format *);
extern struct kmem_zone *xfs_ili_zone;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5049e8ab6e30..20dc65fef6a4 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -44,6 +44,7 @@
#include "xfs_btree.h"
#include <linux/fsmap.h>
#include "xfs_fsmap.h"
+#include "scrub/xfs_scrub.h"
#include <linux/capability.h>
#include <linux/cred.h>
@@ -310,8 +311,8 @@ xfs_readlink_by_handle(
int
xfs_set_dmattrs(
xfs_inode_t *ip,
- u_int evmask,
- u_int16_t state)
+ uint evmask,
+ uint16_t state)
{
xfs_mount_t *mp = ip->i_mount;
xfs_trans_t *tp;
@@ -1088,6 +1089,7 @@ xfs_ioctl_setattr_dax_invalidate(
int *join_flags)
{
struct inode *inode = VFS_I(ip);
+ struct super_block *sb = inode->i_sb;
int error;
*join_flags = 0;
@@ -1100,7 +1102,7 @@ xfs_ioctl_setattr_dax_invalidate(
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+ if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
return -EINVAL;
}
@@ -1200,6 +1202,8 @@ out_unlock:
* 8. for non-realtime files, the extent size hint must be limited
* to half the AG size to avoid alignment extending the extent beyond the
* limits of the AG.
+ *
+ * Please keep this function in sync with xfs_scrub_inode_extsize.
*/
static int
xfs_ioctl_setattr_check_extsize(
@@ -1256,6 +1260,8 @@ xfs_ioctl_setattr_check_extsize(
* 5. Extent size must be a multiple of the appropriate block size.
* 6. The extent size hint must be limited to half the AG size to avoid
* alignment extending the extent beyond the limits of the AG.
+ *
+ * Please keep this function in sync with xfs_scrub_inode_cowextsize.
*/
static int
xfs_ioctl_setattr_check_cowextsize(
@@ -1539,17 +1545,26 @@ out_drop_write:
return error;
}
-STATIC int
-xfs_getbmap_format(void **ap, struct getbmapx *bmv)
+static bool
+xfs_getbmap_format(
+ struct kgetbmap *p,
+ struct getbmapx __user *u,
+ size_t recsize)
{
- struct getbmap __user *base = (struct getbmap __user *)*ap;
-
- /* copy only getbmap portion (not getbmapx) */
- if (copy_to_user(base, bmv, sizeof(struct getbmap)))
- return -EFAULT;
-
- *ap += sizeof(struct getbmap);
- return 0;
+ if (put_user(p->bmv_offset, &u->bmv_offset) ||
+ put_user(p->bmv_block, &u->bmv_block) ||
+ put_user(p->bmv_length, &u->bmv_length) ||
+ put_user(0, &u->bmv_count) ||
+ put_user(0, &u->bmv_entries))
+ return false;
+ if (recsize < sizeof(struct getbmapx))
+ return true;
+ if (put_user(0, &u->bmv_iflags) ||
+ put_user(p->bmv_oflags, &u->bmv_oflags) ||
+ put_user(0, &u->bmv_unused1) ||
+ put_user(0, &u->bmv_unused2))
+ return false;
+ return true;
}
STATIC int
@@ -1559,68 +1574,57 @@ xfs_ioc_getbmap(
void __user *arg)
{
struct getbmapx bmx = { 0 };
- int error;
-
- /* struct getbmap is a strict subset of struct getbmapx. */
- if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
- return -EFAULT;
+ struct kgetbmap *buf;
+ size_t recsize;
+ int error, i;
- if (bmx.bmv_count < 2)
+ switch (cmd) {
+ case XFS_IOC_GETBMAPA:
+ bmx.bmv_iflags = BMV_IF_ATTRFORK;
+ /*FALLTHRU*/
+ case XFS_IOC_GETBMAP:
+ if (file->f_mode & FMODE_NOCMTIME)
+ bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
+ /* struct getbmap is a strict subset of struct getbmapx. */
+ recsize = sizeof(struct getbmap);
+ break;
+ case XFS_IOC_GETBMAPX:
+ recsize = sizeof(struct getbmapx);
+ break;
+ default:
return -EINVAL;
+ }
- bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
- if (file->f_mode & FMODE_NOCMTIME)
- bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
-
- error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
- (__force struct getbmap *)arg+1);
- if (error)
- return error;
-
- /* copy back header - only size of getbmap */
- if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
- return -EFAULT;
- return 0;
-}
-
-STATIC int
-xfs_getbmapx_format(void **ap, struct getbmapx *bmv)
-{
- struct getbmapx __user *base = (struct getbmapx __user *)*ap;
-
- if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
- return -EFAULT;
-
- *ap += sizeof(struct getbmapx);
- return 0;
-}
-
-STATIC int
-xfs_ioc_getbmapx(
- struct xfs_inode *ip,
- void __user *arg)
-{
- struct getbmapx bmx;
- int error;
-
- if (copy_from_user(&bmx, arg, sizeof(bmx)))
+ if (copy_from_user(&bmx, arg, recsize))
return -EFAULT;
if (bmx.bmv_count < 2)
return -EINVAL;
+ if (bmx.bmv_count > ULONG_MAX / recsize)
+ return -ENOMEM;
- if (bmx.bmv_iflags & (~BMV_IF_VALID))
- return -EINVAL;
+ buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0);
+ if (!buf)
+ return -ENOMEM;
- error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
- (__force struct getbmapx *)arg+1);
+ error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, buf);
if (error)
- return error;
+ goto out_free_buf;
- /* copy back header */
- if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
- return -EFAULT;
+ error = -EFAULT;
+ if (copy_to_user(arg, &bmx, recsize))
+ goto out_free_buf;
+ arg += recsize;
+
+ for (i = 0; i < bmx.bmv_entries; i++) {
+ if (!xfs_getbmap_format(buf + i, arg, recsize))
+ goto out_free_buf;
+ arg += recsize;
+ }
+ error = 0;
+out_free_buf:
+ kmem_free(buf);
- return 0;
+ return error;
}
@@ -1702,6 +1706,30 @@ xfs_ioc_getfsmap(
return 0;
}
+STATIC int
+xfs_ioc_scrub_metadata(
+ struct xfs_inode *ip,
+ void __user *arg)
+{
+ struct xfs_scrub_metadata scrub;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&scrub, arg, sizeof(scrub)))
+ return -EFAULT;
+
+ error = xfs_scrub_metadata(ip, &scrub);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &scrub, sizeof(scrub)))
+ return -EFAULT;
+
+ return 0;
+}
+
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1877,14 +1905,15 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
- return xfs_ioc_getbmap(filp, cmd, arg);
-
case XFS_IOC_GETBMAPX:
- return xfs_ioc_getbmapx(ip, arg);
+ return xfs_ioc_getbmap(filp, cmd, arg);
case FS_IOC_GETFSMAP:
return xfs_ioc_getfsmap(ip, arg);
+ case XFS_IOC_SCRUB_METADATA:
+ return xfs_ioc_scrub_metadata(ip, arg);
+
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
case XFS_IOC_PATH_TO_FSHANDLE: {
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index e86c3ea137d2..8de879f0c7d5 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -86,7 +86,7 @@ xfs_file_compat_ioctl(
extern int
xfs_set_dmattrs(
struct xfs_inode *ip,
- u_int evmask,
- u_int16_t state);
+ uint evmask,
+ uint16_t state);
#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index fa0bc4d46065..35c79e246fde 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -556,6 +556,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_ERROR_INJECTION:
case XFS_IOC_ERROR_CLEARALL:
case FS_IOC_GETFSMAP:
+ case XFS_IOC_SCRUB_METADATA:
return xfs_file_ioctl(filp, cmd, p);
#ifndef BROKEN_X86_ALIGNMENT
/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a1909bc064e9..33eb4fb2e3fd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -30,9 +30,11 @@
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
@@ -54,13 +56,13 @@ xfs_bmbt_to_iomap(
struct xfs_mount *mp = ip->i_mount;
if (imap->br_startblock == HOLESTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else if (imap->br_startblock == DELAYSTARTBLOCK) {
- iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
- iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+ iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
if (imap->br_state == XFS_EXT_UNWRITTEN)
iomap->type = IOMAP_UNWRITTEN;
else
@@ -389,7 +391,7 @@ xfs_iomap_prealloc_size(
struct xfs_inode *ip,
loff_t offset,
loff_t count,
- xfs_extnum_t idx)
+ struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
@@ -414,7 +416,7 @@ xfs_iomap_prealloc_size(
*/
if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
- !xfs_iext_get_extent(ifp, idx - 1, &prev) ||
+ !xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
prev.br_startoff + prev.br_blockcount < offset_fsb)
return mp->m_writeio_blocks;
@@ -532,7 +534,7 @@ xfs_file_iomap_begin_delay(
xfs_fileoff_t end_fsb;
int error = 0, eof = 0;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
xfs_fsblock_t prealloc_blocks = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
@@ -557,7 +559,7 @@ xfs_file_iomap_begin_delay(
goto out_unlock;
}
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
+ eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
if (!eof && got.br_startoff <= offset_fsb) {
if (xfs_is_reflink_inode(ip)) {
bool shared;
@@ -591,7 +593,8 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
+ &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -613,7 +616,8 @@ xfs_file_iomap_begin_delay(
retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof);
+ end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
+ eof);
switch (error) {
case 0:
break;
@@ -829,7 +833,8 @@ int
xfs_iomap_write_unwritten(
xfs_inode_t *ip,
xfs_off_t offset,
- xfs_off_t count)
+ xfs_off_t count,
+ bool update_isize)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -840,6 +845,7 @@ xfs_iomap_write_unwritten(
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
struct xfs_defer_ops dfops;
+ struct inode *inode = VFS_I(ip);
xfs_fsize_t i_size;
uint resblks;
int error;
@@ -899,7 +905,8 @@ xfs_iomap_write_unwritten(
i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
if (i_size > offset + count)
i_size = offset + count;
-
+ if (update_isize && i_size > i_size_read(inode))
+ i_size_write(inode, i_size);
i_size = xfs_new_eof(ip, i_size);
if (i_size) {
ip->i_d.di_size = i_size;
@@ -1083,6 +1090,10 @@ xfs_file_iomap_begin(
trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
+ if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
+ & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+
xfs_bmbt_to_iomap(ip, iomap, &imap);
if (shared)
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 00db3ecea084..ee535065c5d0 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 17081c77ef86..56475fcd76f2 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -160,7 +160,6 @@ xfs_generic_create(
if (S_ISCHR(mode) || S_ISBLK(mode)) {
if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
return -EINVAL;
- rdev = sysv_encode_dev(rdev);
} else {
rdev = 0;
}
@@ -535,8 +534,7 @@ xfs_vn_getattr(
case S_IFBLK:
case S_IFCHR:
stat->blksize = BLKDEV_IOSIZE;
- stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
- sysv_minor(ip->i_df.if_u2.if_rdev));
+ stat->rdev = inode->i_rdev;
break;
default:
if (XFS_IS_REALTIME_INODE(ip)) {
@@ -886,22 +884,6 @@ xfs_setattr_size(
return error;
/*
- * We are going to log the inode size change in this transaction so
- * any previous writes that are beyond the on disk EOF and the new
- * EOF that have not been written out need to be written here. If we
- * do not write the data out, we expose ourselves to the null files
- * problem. Note that this includes any block zeroing we did above;
- * otherwise those blocks may not be zeroed after a crash.
- */
- if (did_zeroing ||
- (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- ip->i_d.di_size, newsize);
- if (error)
- return error;
- }
-
- /*
* We've already locked out new page faults, so now we can safely remove
* pages from the page cache knowing they won't get refaulted until we
* drop the XFS_MMAP_EXCL lock after the extent manipulations are
@@ -917,9 +899,29 @@ xfs_setattr_size(
* user visible changes). There's not much we can do about this, except
* to hope that the caller sees ENOMEM and retries the truncate
* operation.
+ *
+ * And we update in-core i_size and truncate page cache beyond newsize
+ * before writing back the [di_size, newsize] range, so we're guaranteed
+ * not to write stale data past the new EOF on truncate down.
*/
truncate_setsize(inode, newsize);
+ /*
+ * We are going to log the inode size change in this transaction so
+ * any previous writes that are beyond the on disk EOF and the new
+ * EOF that have not been written out need to be written here. If we
+ * do not write the data out, we expose ourselves to the null files
+ * problem. Note that this includes any block zeroing we did above;
+ * otherwise those blocks may not be zeroed after a crash.
+ */
+ if (did_zeroing ||
+ (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
+ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ip->i_d.di_size, newsize - 1);
+ if (error)
+ return error;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
return error;
@@ -1231,18 +1233,6 @@ xfs_setup_inode(
inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
- switch (inode->i_mode & S_IFMT) {
- case S_IFBLK:
- case S_IFCHR:
- inode->i_rdev =
- MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
- sysv_minor(ip->i_df.if_u2.if_rdev));
- break;
- default:
- inode->i_rdev = 0;
- break;
- }
-
i_size_write(inode, ip->i_d.di_size);
xfs_diflags_to_iflags(inode, ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c393a2f6d8c3..d58310514423 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -31,16 +31,6 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
-int
-xfs_internal_inum(
- xfs_mount_t *mp,
- xfs_ino_t ino)
-{
- return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
- (xfs_sb_version_hasquota(&mp->m_sb) &&
- xfs_is_quota_inode(&mp->m_sb, ino)));
-}
-
/*
* Return stat information for one inode.
* Return 0 if ok, else errno.
@@ -119,12 +109,11 @@ xfs_bulkstat_one_int(
switch (dic->di_format) {
case XFS_DINODE_FMT_DEV:
- buf->bs_rdev = ip->i_df.if_u2.if_rdev;
+ buf->bs_rdev = sysv_encode_dev(inode->i_rdev);
buf->bs_blksize = BLKDEV_IOSIZE;
buf->bs_blocks = 0;
break;
case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_UUID:
buf->bs_rdev = 0;
buf->bs_blksize = mp->m_sb.sb_blocksize;
buf->bs_blocks = 0;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 17e86e0541af..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -96,6 +96,4 @@ xfs_inumbers(
void __user *buffer, /* buffer with inode info */
inumbers_fmt_pf formatter);
-int xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
-
#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index dcd1292664b3..6282bfc1afa9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -142,6 +142,13 @@ typedef __u32 xfs_nlink_t;
#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
+/*
+ * Return the address of a label. Use barrier() so that the optimizer
+ * won't reorder code to refactor the error jumpouts into a single
+ * return, which throws off the reported address.
+ */
+#define __this_address ({ __label__ __here; __here: barrier(); &&__here; })
+
#define XFS_PROJID_DEFAULT 0
#define MIN(a,b) (min(a,b))
@@ -243,10 +250,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#ifndef STATIC
-# define STATIC noinline
-#endif
-
#else /* !DEBUG */
#ifdef XFS_WARN
@@ -254,21 +257,15 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
#define ASSERT(expr) \
(likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
-#ifndef STATIC
-# define STATIC static noinline
-#endif
-
#else /* !DEBUG && !XFS_WARN */
#define ASSERT(expr) ((void)0)
-#ifndef STATIC
-# define STATIC static noinline
-#endif
-
#endif /* XFS_WARN */
#endif /* DEBUG */
+#define STATIC static noinline
+
#ifdef CONFIG_XFS_RT
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c5107c7bc4bf..38d4227895ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -22,6 +22,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -608,6 +609,7 @@ xfs_log_mount(
xfs_daddr_t blk_offset,
int num_bblks)
{
+ bool fatal = xfs_sb_version_hascrc(&mp->m_sb);
int error = 0;
int min_logfsbs;
@@ -659,9 +661,20 @@ xfs_log_mount(
XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
XFS_MAX_LOG_BYTES);
error = -EINVAL;
+ } else if (mp->m_sb.sb_logsunit > 1 &&
+ mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) {
+ xfs_warn(mp,
+ "log stripe unit %u bytes must be a multiple of block size",
+ mp->m_sb.sb_logsunit);
+ error = -EINVAL;
+ fatal = true;
}
if (error) {
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ /*
+ * Log check errors are always fatal on v5; or whenever bad
+ * metadata leads to a crash.
+ */
+ if (fatal) {
xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
ASSERT(0);
goto out_free_log;
@@ -744,6 +757,7 @@ xfs_log_mount_finish(
{
int error = 0;
bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+ bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED;
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -780,6 +794,21 @@ xfs_log_mount_finish(
mp->m_super->s_flags &= ~MS_ACTIVE;
evict_inodes(mp->m_super);
+ /*
+ * Drain the buffer LRU after log recovery. This is required for v4
+ * filesystems to avoid leaving around buffers with NULL verifier ops,
+ * but we do it unconditionally to make sure we're always in a clean
+ * cache state after mount.
+ *
+ * Don't push in the error case because the AIL may have pending intents
+ * that aren't removed until recovery is cancelled.
+ */
+ if (!error && recovered) {
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_ail_push_all_sync(mp->m_ail);
+ }
+ xfs_wait_buftarg(mp->m_ddev_targp);
+
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -2515,7 +2544,7 @@ next_lv:
if (lv)
vecp = lv->lv_iovecp;
}
- if (record_cnt == 0 && ordered == false) {
+ if (record_cnt == 0 && !ordered) {
if (!lv)
return 0;
break;
@@ -3734,7 +3763,7 @@ xlog_ticket_alloc(
* one of the iclogs. This uses backup pointers stored in a different
* part of the log in case we trash the log structure.
*/
-void
+STATIC void
xlog_verify_dest_ptr(
struct xlog *log,
void *ptr)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 51bf7b827387..129975970d99 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -592,9 +592,9 @@ xlog_valid_lsn(
* a transiently forward state. Instead, we can see the LSN in a
* transiently behind state if we happen to race with a cycle wrap.
*/
- cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ cur_cycle = READ_ONCE(log->l_curr_cycle);
smp_rmb();
- cur_block = ACCESS_ONCE(log->l_curr_block);
+ cur_block = READ_ONCE(log->l_curr_block);
if ((CYCLE_LSN(lsn) > cur_cycle) ||
(CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index ee34899396b2..87b1c331f9eb 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -85,17 +85,21 @@ struct xfs_buf_cancel {
*/
/*
- * Verify the given count of basic blocks is valid number of blocks
- * to specify for an operation involving the given XFS log buffer.
- * Returns nonzero if the count is valid, 0 otherwise.
+ * Verify the log-relative block number and length in basic blocks are valid for
+ * an operation involving the given XFS log buffer. Returns true if the fields
+ * are valid, false otherwise.
*/
-
-static inline int
-xlog_buf_bbcount_valid(
+static inline bool
+xlog_verify_bp(
struct xlog *log,
+ xfs_daddr_t blk_no,
int bbcount)
{
- return bbcount > 0 && bbcount <= log->l_logBBsize;
+ if (blk_no < 0 || blk_no >= log->l_logBBsize)
+ return false;
+ if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
+ return false;
+ return true;
}
/*
@@ -110,7 +114,11 @@ xlog_get_bp(
{
struct xfs_buf *bp;
- if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ /*
+ * Pass log block 0 since we don't have an addr yet, buffer will be
+ * verified on read.
+ */
+ if (!xlog_verify_bp(log, 0, nbblks)) {
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
@@ -180,9 +188,10 @@ xlog_bread_noalign(
{
int error;
- if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
- nbblks);
+ if (!xlog_verify_bp(log, blk_no, nbblks)) {
+ xfs_warn(log->l_mp,
+ "Invalid log block/length (0x%llx, 0x%x) for buffer",
+ blk_no, nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return -EFSCORRUPTED;
}
@@ -265,9 +274,10 @@ xlog_bwrite(
{
int error;
- if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
- nbblks);
+ if (!xlog_verify_bp(log, blk_no, nbblks)) {
+ xfs_warn(log->l_mp,
+ "Invalid log block/length (0x%llx, 0x%x) for buffer",
+ blk_no, nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return -EFSCORRUPTED;
}
@@ -753,7 +763,7 @@ xlog_find_head(
* in the in-core log. The following number can be made tighter if
* we actually look at the block size of the filesystem.
*/
- num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+ num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
if (head_blk >= num_scan_bblks) {
/*
* We are guaranteed that the entire check can be performed
@@ -2975,7 +2985,7 @@ xlog_recover_inode_pass2(
struct xlog_recover_item *item,
xfs_lsn_t current_lsn)
{
- xfs_inode_log_format_t *in_f;
+ struct xfs_inode_log_format *in_f;
xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
xfs_dinode_t *dip;
@@ -2989,10 +2999,10 @@ xlog_recover_inode_pass2(
uint isize;
int need_free = 0;
- if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -3163,16 +3173,8 @@ xlog_recover_inode_pass2(
}
fields = in_f->ilf_fields;
- switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
- case XFS_ILOG_DEV:
+ if (fields & XFS_ILOG_DEV)
xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
- break;
- case XFS_ILOG_UUID:
- memcpy(XFS_DFORK_DPTR(dip),
- &in_f->ilf_u.ilfu_uuid,
- sizeof(uuid_t));
- break;
- }
if (in_f->ilf_size == 2)
goto out_owner_change;
@@ -4297,7 +4299,7 @@ xlog_recover_add_to_trans(
char *dp,
int len)
{
- xfs_inode_log_format_t *in_f; /* any will do */
+ struct xfs_inode_log_format *in_f; /* any will do */
xlog_recover_item_t *item;
char *ptr;
@@ -4331,7 +4333,7 @@ xlog_recover_add_to_trans(
ptr = kmem_alloc(len, KM_SLEEP);
memcpy(ptr, dp, len);
- in_f = (xfs_inode_log_format_t *)ptr;
+ in_f = (struct xfs_inode_log_format *)ptr;
/* take the tail entry */
item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
@@ -5823,7 +5825,7 @@ xlog_recover_cancel(
* Read all of the agf and agi counters and check that they
* are consistent with the superblock counters.
*/
-void
+STATIC void
xlog_recover_check_summary(
struct xlog *log)
{
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 85401155750e..34447dca97d1 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __XFS_MESSAGE_H
#define __XFS_MESSAGE_H 1
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ea7d4b4e50d0..c879b517cc94 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -704,7 +704,7 @@ xfs_mountfs(
xfs_set_maxicount(mp);
/* enable fail_at_unmount as default */
- mp->m_fail_unmount = 1;
+ mp->m_fail_unmount = true;
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
if (error)
@@ -1022,10 +1022,21 @@ xfs_mountfs(
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
+ /*
+ * Cancel all delayed reclaim work and reclaim the inodes directly.
+ * We have to do this /after/ rtunmount and qm_unmount because those
+ * two will have scheduled delayed reclaim for the rt/quota inodes.
+ *
+ * This is slightly different from the unmountfs call sequence
+ * because we could be tearing down a partially set up mount. In
+ * particular, if log_mount_finish fails we bail out without calling
+ * qm_unmount_quotas and therefore rely on qm_unmount to release the
+ * quota inodes.
+ */
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 0c381d71b242..0492436a053f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -134,7 +134,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
- XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
}
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 2f2dc3c09ad0..aa6c5c193f45 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
@@ -274,7 +275,7 @@ xfs_fs_commit_blocks(
(end - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(error);
- error = xfs_iomap_write_unwritten(ip, start, length);
+ error = xfs_iomap_write_unwritten(ip, start, length, false);
if (error)
goto out_drop_iolock;
}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b587cb99b2b7..bf45951e28fe 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3246815c24d6..cc041a29eb70 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -273,7 +273,7 @@ xfs_reflink_reserve_cow(
struct xfs_bmbt_irec got;
int error = 0;
bool eof = false, trimmed;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -284,7 +284,7 @@ xfs_reflink_reserve_cow(
* tree.
*/
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
eof = true;
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
@@ -312,7 +312,7 @@ xfs_reflink_reserve_cow(
return error;
error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &idx, eof);
+ imap->br_blockcount, 0, &got, &icur, eof);
if (error == -ENOSPC || error == -EDQUOT)
trace_xfs_reflink_cow_enospc(ip, imap);
if (error)
@@ -353,29 +353,22 @@ xfs_reflink_convert_cow(
xfs_off_t offset,
xfs_off_t count)
{
- struct xfs_bmbt_irec got;
- struct xfs_defer_ops dfops;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_extnum_t idx;
- bool found;
- int error = 0;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ struct xfs_bmbt_irec imap;
+ struct xfs_defer_ops dfops;
+ xfs_fsblock_t first_block = NULLFSBLOCK;
+ int nimaps = 1, error = 0;
- /* Convert all the extents to real from unwritten. */
- for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
- found && got.br_startoff < end_fsb;
- found = xfs_iext_get_extent(ifp, ++idx, &got)) {
- error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
- end_fsb - offset_fsb, &dfops);
- if (error)
- break;
- }
+ ASSERT(count != 0);
- /* Finish up. */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
+ XFS_BMAPI_CONVERT_ONLY, &first_block, 0, &imap, &nimaps,
+ &dfops);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -399,7 +392,7 @@ xfs_reflink_allocate_cow(
bool trimmed;
xfs_filblks_t resaligned;
xfs_extlen_t resblks = 0;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
retry:
ASSERT(xfs_is_reflink_inode(ip));
@@ -409,7 +402,7 @@ retry:
* Even if the extent is not shared we might have a preallocation for
* it in the COW fork. If so use it.
*/
- if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) &&
+ if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
got.br_startoff <= offset_fsb) {
*shared = true;
@@ -496,13 +489,13 @@ xfs_reflink_find_cow_mapping(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
xfs_fileoff_t offset_fsb;
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(xfs_is_reflink_inode(ip));
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
- if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
return false;
if (got.br_startoff > offset_fsb)
return false;
@@ -524,18 +517,18 @@ xfs_reflink_trim_irec_to_next_cow(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
if (!xfs_is_reflink_inode(ip))
return;
/* Find the extent in the CoW fork. */
- if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got))
return;
/* This is the extent before; try sliding up one. */
if (got.br_startoff < offset_fsb) {
- if (!xfs_iext_get_extent(ifp, idx + 1, &got))
+ if (!xfs_iext_next_extent(ifp, &icur, &got))
return;
}
@@ -562,24 +555,32 @@ xfs_reflink_cancel_cow_blocks(
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
int error = 0;
if (!xfs_is_reflink_inode(ip))
return 0;
- if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
return 0;
- while (got.br_startoff < end_fsb) {
+ /* Walk backwards until we're out of the I/O range... */
+ while (got.br_startoff + got.br_blockcount > offset_fsb) {
del = got;
xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+
+ /* Extent delete may have bumped ext forward */
+ if (!del.br_blockcount) {
+ xfs_iext_prev(ifp, &icur);
+ goto next_extent;
+ }
+
trace_xfs_reflink_cancel_cow(ip, &del);
if (isnullstartblock(del.br_startblock)) {
error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
- &idx, &got, &del);
+ &icur, &got, &del);
if (error)
break;
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
@@ -610,10 +611,10 @@ xfs_reflink_cancel_cow_blocks(
}
/* Remove the mapping from the CoW fork. */
- xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
}
-
- if (!xfs_iext_get_extent(ifp, ++idx, &got))
+next_extent:
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -698,7 +699,7 @@ xfs_reflink_end_cow(
int error;
unsigned int resblks;
xfs_filblks_t rlen;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
trace_xfs_reflink_end_cow(ip, offset, count);
@@ -733,21 +734,22 @@ xfs_reflink_end_cow(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- /* If there is a hole at end_fsb - 1 go to the previous extent */
- if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
- got.br_startoff > end_fsb) {
- ASSERT(idx > 0);
- xfs_iext_get_extent(ifp, --idx, &got);
- }
+ /*
+ * In case of racing, overlapping AIO writes no COW extents might be
+ * left by the time I/O completes for the loser of the race. In that
+ * case we are done.
+ */
+ if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+ goto out_cancel;
/* Walk backwards until we're out of the I/O range... */
while (got.br_startoff + got.br_blockcount > offset_fsb) {
del = got;
xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
- /* Extent delete may have bumped idx forward */
+ /* Extent delete may have bumped ext forward */
if (!del.br_blockcount) {
- idx--;
+ xfs_iext_prev(ifp, &icur);
goto next_extent;
}
@@ -759,7 +761,7 @@ xfs_reflink_end_cow(
* allocated but have not yet been involved in a write.
*/
if (got.br_state == XFS_EXT_UNWRITTEN) {
- idx--;
+ xfs_iext_prev(ifp, &icur);
goto next_extent;
}
@@ -790,14 +792,14 @@ xfs_reflink_end_cow(
goto out_defer;
/* Remove the mapping from the CoW fork. */
- xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
xfs_defer_ijoin(&dfops, ip);
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_defer;
next_extent:
- if (!xfs_iext_get_extent(ifp, idx, &got))
+ if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -809,6 +811,7 @@ next_extent:
out_defer:
xfs_defer_cancel(&dfops);
+out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
@@ -1426,7 +1429,7 @@ xfs_reflink_inode_has_shared_extents(
xfs_extlen_t aglen;
xfs_agblock_t rbno;
xfs_extlen_t rlen;
- xfs_extnum_t idx;
+ struct xfs_iext_cursor icur;
bool found;
int error;
@@ -1438,7 +1441,7 @@ xfs_reflink_inode_has_shared_extents(
}
*has_shared = false;
- found = xfs_iext_lookup_extent(ip, ifp, 0, &idx, &got);
+ found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
@@ -1457,7 +1460,7 @@ xfs_reflink_inode_has_shared_extents(
return 0;
}
next:
- found = xfs_iext_get_extent(ifp, ++idx, &got);
+ found = xfs_iext_next_extent(ifp, &icur, &got);
}
return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 79defa722bf1..3f30f846d7f2 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -138,6 +138,7 @@ int xfs_rtalloc_query_range(struct xfs_trans *tp,
int xfs_rtalloc_query_all(struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
+bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
@@ -146,6 +147,7 @@ int xfs_rtalloc_query_all(struct xfs_trans *tp,
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
+# define xfs_verify_rtbno(m, r) (false)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c996f4ae4a5f..f663022353c0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1637,7 +1637,7 @@ xfs_fs_fill_super(
/* version 5 superblocks support inode version counters. */
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
- sb->s_flags |= MS_I_VERSION;
+ sb->s_flags |= SB_I_VERSION;
if (mp->m_flags & XFS_MOUNT_DAX) {
xfs_warn(mp,
@@ -1654,6 +1654,16 @@ xfs_fs_fill_super(
"DAX and reflink have not been tested together!");
}
+ if (mp->m_flags & XFS_MOUNT_DISCARD) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+
+ if (!blk_queue_discard(q)) {
+ xfs_warn(mp, "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ }
+ }
+
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bb5514688d47..d718a10c2271 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -218,53 +218,15 @@ TRACE_EVENT(xfs_attr_list_node_descend,
__entry->bt_before)
);
-TRACE_EVENT(xfs_iext_insert,
- TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
- struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
- TP_ARGS(ip, idx, r, state, caller_ip),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_extnum_t, idx)
- __field(xfs_fileoff_t, startoff)
- __field(xfs_fsblock_t, startblock)
- __field(xfs_filblks_t, blockcount)
- __field(xfs_exntst_t, state)
- __field(int, bmap_state)
- __field(unsigned long, caller_ip)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->idx = idx;
- __entry->startoff = r->br_startoff;
- __entry->startblock = r->br_startblock;
- __entry->blockcount = r->br_blockcount;
- __entry->state = r->br_state;
- __entry->bmap_state = state;
- __entry->caller_ip = caller_ip;
- ),
- TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
- "offset %lld block %lld count %lld flag %d caller %ps",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
- (long)__entry->idx,
- __entry->startoff,
- (int64_t)__entry->startblock,
- __entry->blockcount,
- __entry->state,
- (char *)__entry->caller_ip)
-);
-
DECLARE_EVENT_CLASS(xfs_bmap_class,
- TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
+ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state,
unsigned long caller_ip),
- TP_ARGS(ip, idx, state, caller_ip),
+ TP_ARGS(ip, cur, state, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(xfs_extnum_t, idx)
+ __field(void *, leaf);
+ __field(int, pos);
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -277,10 +239,11 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
struct xfs_bmbt_irec r;
ifp = xfs_iext_state_to_fork(ip, state);
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
+ xfs_iext_get_extent(ifp, cur, &r);
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->idx = idx;
+ __entry->leaf = cur->leaf;
+ __entry->pos = cur->pos;
__entry->startoff = r.br_startoff;
__entry->startblock = r.br_startblock;
__entry->blockcount = r.br_blockcount;
@@ -288,12 +251,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__entry->bmap_state = state;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+ TP_printk("dev %d:%d ino 0x%llx state %s cur 0x%p/%d "
"offset %lld block %lld count %lld flag %d caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
- (long)__entry->idx,
+ __entry->leaf,
+ __entry->pos,
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount,
@@ -303,13 +267,15 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
#define DEFINE_BMAP_EVENT(name) \
DEFINE_EVENT(xfs_bmap_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
+ TP_PROTO(struct xfs_inode *ip, struct xfs_iext_cursor *cur, int state, \
unsigned long caller_ip), \
- TP_ARGS(ip, idx, state, caller_ip))
+ TP_ARGS(ip, cur, state, caller_ip))
+DEFINE_BMAP_EVENT(xfs_iext_insert);
DEFINE_BMAP_EVENT(xfs_iext_remove);
DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
DEFINE_BMAP_EVENT(xfs_bmap_post_update);
-DEFINE_BMAP_EVENT(xfs_extlist);
+DEFINE_BMAP_EVENT(xfs_read_extent);
+DEFINE_BMAP_EVENT(xfs_write_extent);
DECLARE_EVENT_CLASS(xfs_buf_class,
TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
@@ -688,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
-
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 354368a906e5..cef89f7127d3 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -25,6 +25,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_trace.h"
+#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_log.h"
@@ -514,11 +515,26 @@ xfsaild(
current->flags |= PF_MEMALLOC;
set_freezable();
- while (!kthread_should_stop()) {
+ while (1) {
if (tout && tout <= 20)
- __set_current_state(TASK_KILLABLE);
+ set_current_state(TASK_KILLABLE);
else
- __set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * Check kthread_should_stop() after we set the task state
+ * to guarantee that we either see the stop bit and exit or
+ * the task state is reset to runnable such that it's not
+ * scheduled out indefinitely and detects the stop bit at
+ * next iteration.
+ *
+ * A memory barrier is included in the task state set above to
+ * serialize against kthread_stop().
+ */
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
spin_lock(&ailp->xa_lock);