aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/kmem.c15
-rw-r--r--fs/xfs/kmem.h9
-rw-r--r--fs/xfs/xfs_acl.c12
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_bmap.c2
-rw-r--r--fs/xfs/xfs_bmap_btree.c44
-rw-r--r--fs/xfs/xfs_bmap_btree.h4
-rw-r--r--fs/xfs/xfs_bmap_util.c69
-rw-r--r--fs/xfs/xfs_btree.c170
-rw-r--r--fs/xfs/xfs_btree.h19
-rw-r--r--fs/xfs/xfs_buf.c253
-rw-r--r--fs/xfs/xfs_buf.h17
-rw-r--r--fs/xfs/xfs_buf_item.c25
-rw-r--r--fs/xfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/xfs_dir2_block.c6
-rw-r--r--fs/xfs/xfs_dir2_format.h51
-rw-r--r--fs/xfs/xfs_dir2_leaf.c20
-rw-r--r--fs/xfs/xfs_dir2_readdir.c4
-rw-r--r--fs/xfs/xfs_dir2_sf.c6
-rw-r--r--fs/xfs/xfs_dquot.c26
-rw-r--r--fs/xfs/xfs_dquot_item.c3
-rw-r--r--fs/xfs/xfs_extent_busy.c3
-rw-r--r--fs/xfs/xfs_fs.h2
-rw-r--r--fs/xfs/xfs_icache.c17
-rw-r--r--fs/xfs/xfs_icache.h6
-rw-r--r--fs/xfs/xfs_inode_buf.c10
-rw-r--r--fs/xfs/xfs_inode_buf.h18
-rw-r--r--fs/xfs/xfs_ioctl.c36
-rw-r--r--fs/xfs/xfs_ioctl32.c18
-rw-r--r--fs/xfs/xfs_itable.c7
-rw-r--r--fs/xfs/xfs_log.c3
-rw-r--r--fs/xfs/xfs_log_format.h8
-rw-r--r--fs/xfs/xfs_log_recover.c194
-rw-r--r--fs/xfs/xfs_qm.c287
-rw-r--r--fs/xfs/xfs_qm.h4
-rw-r--r--fs/xfs/xfs_super.c12
-rw-r--r--fs/xfs/xfs_symlink.c2
37 files changed, 880 insertions, 512 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 4a7286c1dc80..a02cfb9e3bce 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -27,8 +27,6 @@
/*
* Greedy allocation. May fail and may return vmalloced memory.
- *
- * Must be freed using kmem_free_large.
*/
void *
kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
@@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
void *ptr;
size_t kmsize = maxsize;
- while (!(ptr = kmem_zalloc_large(kmsize))) {
+ while (!(ptr = vzalloc(kmsize))) {
if ((kmsize >>= 1) <= minsize)
kmsize = minsize;
}
@@ -75,6 +73,17 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
return ptr;
}
+void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+ void *ptr;
+
+ ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+ if (ptr)
+ return ptr;
+ return vzalloc(size);
+}
+
void
kmem_free(const void *ptr)
{
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index b2f2620f9a87..3a7371cab508 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -57,17 +57,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
extern void *kmem_alloc(size_t, xfs_km_flags_t);
extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
extern void kmem_free(const void *);
-static inline void *kmem_zalloc_large(size_t size)
-{
- return vzalloc(size);
-}
-static inline void kmem_free_large(void *ptr)
-{
- vfree(ptr);
-}
extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 69518960b2ba..0e2f37efedd0 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
* go out to the disk.
*/
len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kzalloc(len, GFP_KERNEL);
+ xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
if (!xfs_acl)
return ERR_PTR(-ENOMEM);
@@ -175,10 +175,10 @@ xfs_get_acl(struct inode *inode, int type)
if (IS_ERR(acl))
goto out;
- out_update_cache:
+out_update_cache:
set_cached_acl(inode, type, acl);
- out:
- kfree(xfs_acl);
+out:
+ kmem_free(xfs_acl);
return acl;
}
@@ -209,7 +209,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
struct xfs_acl *xfs_acl;
int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kzalloc(len, GFP_KERNEL);
+ xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
if (!xfs_acl)
return -ENOMEM;
@@ -222,7 +222,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
len, ATTR_ROOT);
- kfree(xfs_acl);
+ kmem_free(xfs_acl);
} else {
/*
* A NULL ACL argument means we want to remove the ACL.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 977da0ec6604..e51e581454e9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1582,7 +1582,7 @@ xfs_vm_write_begin(
unlock_page(page);
if (pos + len > i_size_read(inode))
- truncate_pagecache(inode, pos + len, i_size_read(inode));
+ truncate_pagecache(inode, i_size_read(inode));
page_cache_release(page);
page = NULL;
@@ -1618,7 +1618,7 @@ xfs_vm_write_end(
loff_t to = pos + len;
if (to > isize) {
- truncate_pagecache(inode, to, isize);
+ truncate_pagecache(inode, isize);
xfs_vm_kill_delalloc_range(inode, isize, to);
}
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 92b830901d60..f47e65c30be6 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4450,7 +4450,7 @@ xfs_bmapi_write(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */
+ struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
int eof; /* after the end of extents */
int error; /* error return */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index cf3bc76710c3..bb8de8e399c4 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -925,3 +925,47 @@ xfs_bmdr_maxrecs(
return blocklen / sizeof(xfs_bmdr_rec_t);
return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
}
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_ino_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(tp || buffer_list);
+ ASSERT(!(tp && buffer_list));
+ if (whichfork == XFS_DATA_FORK)
+ ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ else
+ ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+ cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+ if (!cur)
+ return ENOMEM;
+
+ error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 1b726d626941..e367461a638e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,6 +236,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+ int whichfork, xfs_ino_t new_owner,
+ struct list_head *buffer_list);
+
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 541d59f5e658..97f952caea74 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -612,13 +612,9 @@ xfs_getbmap(
if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
return XFS_ERROR(ENOMEM);
- out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
- if (!out) {
- out = kmem_zalloc_large(bmv->bmv_count *
- sizeof(struct getbmapx));
- if (!out)
- return XFS_ERROR(ENOMEM);
- }
+ out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+ if (!out)
+ return XFS_ERROR(ENOMEM);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
@@ -754,10 +750,7 @@ xfs_getbmap(
break;
}
- if (is_vmalloc_addr(out))
- kmem_free_large(out);
- else
- kmem_free(out);
+ kmem_free(out);
return error;
}
@@ -1789,14 +1782,6 @@ xfs_swap_extents(
int taforkblks = 0;
__uint64_t tmp;
- /*
- * We have no way of updating owner information in the BMBT blocks for
- * each inode on CRC enabled filesystems, so to avoid corrupting the
- * this metadata we simply don't allow extent swaps to occur.
- */
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return XFS_ERROR(EINVAL);
-
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
error = XFS_ERROR(ENOMEM);
@@ -1920,6 +1905,42 @@ xfs_swap_extents(
goto out_trans_cancel;
}
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ /*
+ * Before we've swapped the forks, lets set the owners of the forks
+ * appropriately. We have to do this as we are demand paging the btree
+ * buffers, and so the validation done on read will expect the owner
+ * field to be correctly set. Once we change the owners, we can swap the
+ * inode forks.
+ *
+ * Note the trickiness in setting the log flags - we set the owner log
+ * flag on the opposite inode (i.e. the inode we are setting the new
+ * owner to be) because once we swap the forks and log that, log
+ * recovery is going to see the fork as owned by the swapped inode,
+ * not the pre-swapped inodes.
+ */
+ src_log_flags = XFS_ILOG_CORE;
+ target_log_flags = XFS_ILOG_CORE;
+ if (ip->i_d.di_version == 3 &&
+ ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ target_log_flags |= XFS_ILOG_DOWNER;
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+ tip->i_ino, NULL);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ if (tip->i_d.di_version == 3 &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ src_log_flags |= XFS_ILOG_DOWNER;
+ error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+ ip->i_ino, NULL);
+ if (error)
+ goto out_trans_cancel;
+ }
+
/*
* Swap the data forks of the inodes
*/
@@ -1957,7 +1978,6 @@ xfs_swap_extents(
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
- src_log_flags = XFS_ILOG_CORE;
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
/* If the extents fit in the inode, fix the
@@ -1971,11 +1991,12 @@ xfs_swap_extents(
src_log_flags |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
+ ASSERT(ip->i_d.di_version < 3 ||
+ (src_log_flags & XFS_ILOG_DOWNER));
src_log_flags |= XFS_ILOG_DBROOT;
break;
}
- target_log_flags = XFS_ILOG_CORE;
switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
/* If the extents fit in the inode, fix the
@@ -1990,13 +2011,11 @@ xfs_swap_extents(
break;
case XFS_DINODE_FMT_BTREE:
target_log_flags |= XFS_ILOG_DBROOT;
+ ASSERT(tip->i_d.di_version < 3 ||
+ (target_log_flags & XFS_ILOG_DOWNER));
break;
}
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
xfs_trans_log_inode(tp, ip, src_log_flags);
xfs_trans_log_inode(tp, tip, target_log_flags);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7a2b4da3c0db..5690e102243d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -855,6 +855,41 @@ xfs_btree_readahead(
return xfs_btree_readahead_sblock(cur, lr, block);
}
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+ return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ } else {
+ ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+ ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+ return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ be32_to_cpu(ptr->s));
+ }
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ xfs_extlen_t count)
+{
+ xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+ xfs_btree_ptr_to_daddr(cur, ptr),
+ cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
/*
* Set the buffer for level "lev" in the cursor to bp, releasing
* any previous buffer.
@@ -1073,24 +1108,6 @@ xfs_btree_buf_to_ptr(
}
}
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
- return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
- } else {
- ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
- ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
- return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(ptr->s));
- }
-}
-
STATIC void
xfs_btree_set_refs(
struct xfs_btree_cur *cur,
@@ -3869,3 +3886,120 @@ xfs_btree_get_rec(
*stat = 1;
return 0;
}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction. If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+ struct xfs_btree_cur *cur,
+ int level,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+ else
+ block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+ * buffer pointer here and we shouldn't attempt to log the change as the
+ * information is already held in the inode and discarded when the root
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+ if (bp) {
+ if (cur->bc_tp) {
+ xfs_trans_ordered_buf(cur->bc_tp, bp);
+ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+ } else {
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+ } else {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ }
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return ENOENT;
+
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+ struct xfs_btree_cur *cur,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ lptr = *ptr;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_block_change_owner(cur, level,
+ new_owner,
+ buffer_list);
+ } while (!error);
+
+ if (error != ENOENT)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index c8473c7ef45e..06729b67ad58 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -121,15 +121,18 @@ union xfs_btree_rec {
/*
* For logging record fields.
*/
-#define XFS_BB_MAGIC 0x01
-#define XFS_BB_LEVEL 0x02
-#define XFS_BB_NUMRECS 0x04
-#define XFS_BB_LEFTSIB 0x08
-#define XFS_BB_RIGHTSIB 0x10
-#define XFS_BB_BLKNO 0x20
+#define XFS_BB_MAGIC (1 << 0)
+#define XFS_BB_LEVEL (1 << 1)
+#define XFS_BB_NUMRECS (1 << 2)
+#define XFS_BB_LEFTSIB (1 << 3)
+#define XFS_BB_RIGHTSIB (1 << 4)
+#define XFS_BB_BLKNO (1 << 5)
+#define XFS_BB_LSN (1 << 6)
+#define XFS_BB_UUID (1 << 7)
+#define XFS_BB_OWNER (1 << 8)
#define XFS_BB_NUM_BITS 5
#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
-#define XFS_BB_NUM_BITS_CRC 8
+#define XFS_BB_NUM_BITS_CRC 9
#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
/*
@@ -442,6 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+ struct list_head *buffer_list);
/*
* btree block CRC helpers
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c06823fe10d3..263470075ea2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -81,54 +81,6 @@ xfs_buf_vmap_len(
}
/*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-STATIC void
-xfs_buf_lru_add(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
-
- spin_lock(&btp->bt_lru_lock);
- if (list_empty(&bp->b_lru)) {
- atomic_inc(&bp->b_hold);
- list_add_tail(&bp->b_lru, &btp->bt_lru);
- btp->bt_lru_nr++;
- bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
- }
- spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
- */
-STATIC void
-xfs_buf_lru_del(
- struct xfs_buf *bp)
-{
- struct xfs_buftarg *btp = bp->b_target;
-
- if (list_empty(&bp->b_lru))
- return;
-
- spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru)) {
- list_del_init(&bp->b_lru);
- btp->bt_lru_nr--;
- }
- spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
@@ -151,20 +103,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- atomic_set(&(bp)->b_lru_ref, 0);
- if (!list_empty(&bp->b_lru)) {
- struct xfs_buftarg *btp = bp->b_target;
+ spin_lock(&bp->b_lock);
+ atomic_set(&bp->b_lru_ref, 0);
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+ (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ atomic_dec(&bp->b_hold);
- spin_lock(&btp->bt_lru_lock);
- if (!list_empty(&bp->b_lru) &&
- !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
- list_del_init(&bp->b_lru);
- btp->bt_lru_nr--;
- atomic_dec(&bp->b_hold);
- }
- spin_unlock(&btp->bt_lru_lock);
- }
ASSERT(atomic_read(&bp->b_hold) >= 1);
+ spin_unlock(&bp->b_lock);
}
static int
@@ -228,6 +174,7 @@ _xfs_buf_alloc(
INIT_LIST_HEAD(&bp->b_list);
RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
+ spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp);
bp->b_target = target;
bp->b_flags = flags;
@@ -917,12 +864,33 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- if (!(bp->b_flags & XBF_STALE) &&
- atomic_read(&bp->b_lru_ref)) {
- xfs_buf_lru_add(bp);
+ spin_lock(&bp->b_lock);
+ if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ /*
+ * If the buffer is added to the LRU take a new
+ * reference to the buffer for the LRU and clear the
+ * (now stale) dispose list state flag
+ */
+ if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ bp->b_state &= ~XFS_BSTATE_DISPOSE;
+ atomic_inc(&bp->b_hold);
+ }
+ spin_unlock(&bp->b_lock);
spin_unlock(&pag->pag_buf_lock);
} else {
- xfs_buf_lru_del(bp);
+ /*
+ * most of the time buffers will already be removed from
+ * the LRU, so optimise that case by checking for the
+ * XFS_BSTATE_DISPOSE flag indicating the last list the
+ * buffer was on was the disposal list
+ */
+ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+ list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ } else {
+ ASSERT(list_empty(&bp->b_lru));
+ }
+ spin_unlock(&bp->b_lock);
+
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
@@ -1502,83 +1470,121 @@ xfs_buf_iomove(
* returned. These buffers will have an elevated hold count, so wait on those
* while freeing all the buffers only held by the LRU.
*/
+static enum lru_status
+xfs_buftarg_wait_rele(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ if (atomic_read(&bp->b_hold) > 1) {
+ /* need to wait, so skip it this pass */
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ return LRU_SKIP;
+ }
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+
+ /*
+ * clear the LRU reference count so the buffer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
void
xfs_wait_buftarg(
struct xfs_buftarg *btp)
{
- struct xfs_buf *bp;
+ LIST_HEAD(dispose);
+ int loop = 0;
-restart:
- spin_lock(&btp->bt_lru_lock);
- while (!list_empty(&btp->bt_lru)) {
- bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
- if (atomic_read(&bp->b_hold) > 1) {
- trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
- list_move_tail(&bp->b_lru, &btp->bt_lru);
- spin_unlock(&btp->bt_lru_lock);
- delay(100);
- goto restart;
+ /* loop until there is nothing left on the lru list. */
+ while (list_lru_count(&btp->bt_lru)) {
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ &dispose, LONG_MAX);
+
+ while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ xfs_buf_rele(bp);
}
- /*
- * clear the LRU reference count so the buffer doesn't get
- * ignored in xfs_buf_rele().
- */
- atomic_set(&bp->b_lru_ref, 0);
- spin_unlock(&btp->bt_lru_lock);
- xfs_buf_rele(bp);
- spin_lock(&btp->bt_lru_lock);
+ if (loop++ != 0)
+ delay(100);
}
- spin_unlock(&btp->bt_lru_lock);
}
-int
-xfs_buftarg_shrink(
+static enum lru_status
+xfs_buftarg_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
+ struct list_head *dispose = arg;
+
+ /*
+ * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+ * If we fail to get the lock, just skip it.
+ */
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ spin_unlock(&bp->b_lock);
+ return LRU_ROTATE;
+ }
+
+ bp->b_state |= XFS_BSTATE_DISPOSE;
+ list_move(item, dispose);
+ spin_unlock(&bp->b_lock);
+ return LRU_REMOVED;
+}
+
+static unsigned long
+xfs_buftarg_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
- struct xfs_buf *bp;
- int nr_to_scan = sc->nr_to_scan;
LIST_HEAD(dispose);
+ unsigned long freed;
+ unsigned long nr_to_scan = sc->nr_to_scan;
- if (!nr_to_scan)
- return btp->bt_lru_nr;
-
- spin_lock(&btp->bt_lru_lock);
- while (!list_empty(&btp->bt_lru)) {
- if (nr_to_scan-- <= 0)
- break;
-
- bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
- /*
- * Decrement the b_lru_ref count unless the value is already
- * zero. If the value is already zero, we need to reclaim the
- * buffer, otherwise it gets another trip through the LRU.
- */
- if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
- list_move_tail(&bp->b_lru, &btp->bt_lru);
- continue;
- }
-
- /*
- * remove the buffer from the LRU now to avoid needing another
- * lock round trip inside xfs_buf_rele().
- */
- list_move(&bp->b_lru, &dispose);
- btp->bt_lru_nr--;
- bp->b_lru_flags |= _XBF_LRU_DISPOSE;
- }
- spin_unlock(&btp->bt_lru_lock);
+ freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+ &dispose, &nr_to_scan);
while (!list_empty(&dispose)) {
+ struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
}
- return btp->bt_lru_nr;
+ return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ return list_lru_count_node(&btp->bt_lru, sc->nid);
}
void
@@ -1587,6 +1593,7 @@ xfs_free_buftarg(
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
+ list_lru_destroy(&btp->bt_lru);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
@@ -1660,12 +1667,16 @@ xfs_alloc_buftarg(
if (!btp->bt_bdi)
goto error;
- INIT_LIST_HEAD(&btp->bt_lru);
- spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+
+ if (list_lru_init(&btp->bt_lru))
+ goto error;
+
+ btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&btp->bt_shrinker);
return btp;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 433a12ed7b17..e65683361017 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
+#include <linux/list_lru.h>
/*
* Base types
@@ -59,7 +60,6 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
-#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
typedef unsigned int xfs_buf_flags_t;
@@ -78,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }, \
- { _XBF_LRU_DISPOSE, "LRU_DISPOSE" }
+ { _XBF_COMPOUND, "COMPOUND" }
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
typedef struct xfs_buftarg {
dev_t bt_dev;
@@ -92,9 +96,7 @@ typedef struct xfs_buftarg {
/* LRU control structures */
struct shrinker bt_shrinker;
- struct list_head bt_lru;
- spinlock_t bt_lru_lock;
- unsigned int bt_lru_nr;
+ struct list_lru bt_lru;
} xfs_buftarg_t;
struct xfs_buf;
@@ -137,7 +139,8 @@ typedef struct xfs_buf {
* bt_lru_lock and not by b_sema
*/
struct list_head b_lru; /* lru list */
- xfs_buf_flags_t b_lru_flags; /* internal lru status flags */
+ spinlock_t b_lock; /* internal state lock */
+ unsigned int b_state; /* internal state flags */
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3a944b198e35..f1d85cfc0a54 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -613,13 +613,28 @@ xfs_buf_item_unlock(
}
}
}
- if (clean || aborted) {
- if (atomic_dec_and_test(&bip->bli_refcount)) {
- ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp));
+
+ /*
+ * Clean buffers, by definition, cannot be in the AIL. However, aborted
+ * buffers may be dirty and hence in the AIL. Therefore if we are
+ * aborting a buffer and we've just taken the last refernce away, we
+ * have to check if it is in the AIL before freeing it. We need to free
+ * it in this case, because an aborted transaction has already shut the
+ * filesystem down and this is the last chance we will have to do so.
+ */
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+ if (clean)
+ xfs_buf_item_relse(bp);
+ else if (aborted) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ spin_lock(&lip->li_ailp->xa_lock);
+ xfs_trans_ail_delete(lip->li_ailp, lip,
+ SHUTDOWN_LOG_IO_ERROR);
+ }
xfs_buf_item_relse(bp);
}
- } else
- atomic_dec(&bip->bli_refcount);
+ }
if (!(flags & XFS_BLI_HOLD))
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index d4e59a4ff59f..20bf8e8002d6 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -635,6 +635,7 @@ xfs_da3_root_split(
xfs_trans_log_buf(tp, bp, 0, size - 1);
bp->b_ops = blk1->bp->b_ops;
+ xfs_trans_buf_copy_type(bp, blk1->bp);
blk1->bp = bp;
blk1->blkno = blkno;
@@ -1223,6 +1224,7 @@ xfs_da3_node_toosmall(
/* start with smaller blk num */
forward = nodehdr.forw < nodehdr.back;
for (i = 0; i < 2; forward = !forward, i++) {
+ struct xfs_da3_icnode_hdr thdr;
if (forward)
blkno = nodehdr.forw;
else
@@ -1235,10 +1237,10 @@ xfs_da3_node_toosmall(
return(error);
node = bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(&thdr, node);
xfs_trans_brelse(state->args->trans, bp);
- if (count - nodehdr.count >= 0)
+ if (count - thdr.count >= 0)
break; /* fits with at least 25% to spare */
}
if (i >= 2) {
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 0957aa98b6c0..12dad188939d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1158,7 +1158,7 @@ xfs_dir2_sf_to_block(
/*
* Create entry for .
*/
- dep = xfs_dir3_data_dot_entry_p(hdr);
+ dep = xfs_dir3_data_dot_entry_p(mp, hdr);
dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
@@ -1172,7 +1172,7 @@ xfs_dir2_sf_to_block(
/*
* Create entry for ..
*/
- dep = xfs_dir3_data_dotdot_entry_p(hdr);
+ dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
@@ -1183,7 +1183,7 @@ xfs_dir2_sf_to_block(
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
(char *)dep - (char *)hdr));
- offset = xfs_dir3_data_first_offset(hdr);
+ offset = xfs_dir3_data_first_offset(mp);
/*
* Loop over existing entries, stuff them in.
*/
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index a0961a61ac1a..9cf67381adf6 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -497,69 +497,58 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
/*
* Offsets of . and .. in data space (always block 0)
*
- * The macros are used for shortform directories as they have no headers to read
- * the magic number out of. Shortform directories need to know the size of the
- * data block header because the sfe embeds the block offset of the entry into
- * it so that it doesn't change when format conversion occurs. Bad Things Happen
- * if we don't follow this rule.
- *
* XXX: there is scope for significant optimisation of the logic here. Right
* now we are checking for "dir3 format" over and over again. Ideally we should
* only do it once for each operation.
*/
-#define XFS_DIR3_DATA_DOT_OFFSET(mp) \
- xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
-#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \
- (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1))
-#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \
- (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2))
-
static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dot_offset(struct xfs_mount *mp)
{
- return xfs_dir3_data_entry_offset(hdr);
+ return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
}
static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
{
- bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
- return xfs_dir3_data_dot_offset(hdr) +
- __xfs_dir3_data_entsize(dir3, 1);
+ return xfs_dir3_data_dot_offset(mp) +
+ xfs_dir3_data_entsize(mp, 1);
}
static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_first_offset(struct xfs_mount *mp)
{
- bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
- return xfs_dir3_data_dotdot_offset(hdr) +
- __xfs_dir3_data_entsize(dir3, 2);
+ return xfs_dir3_data_dotdot_offset(mp) +
+ xfs_dir3_data_entsize(mp, 2);
}
/*
* location of . and .. in data space (always block 0)
*/
static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dot_entry_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
{
return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_dot_offset(hdr));
+ ((char *)hdr + xfs_dir3_data_dot_offset(mp));
}
static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_dotdot_entry_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
{
return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr));
+ ((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
}
static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr)
+xfs_dir3_data_first_entry_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
{
return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_first_offset(hdr));
+ ((char *)hdr + xfs_dir3_data_first_offset(mp));
}
/*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 08984eeee159..1021c8356d08 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -180,6 +180,11 @@ xfs_dir3_leaf_check_int(
return true;
}
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
static bool
xfs_dir3_leaf_verify(
struct xfs_buf *bp,
@@ -191,24 +196,25 @@ xfs_dir3_leaf_verify(
ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+ __uint16_t magic3;
- if ((magic == XFS_DIR2_LEAF1_MAGIC &&
- leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) ||
- (magic == XFS_DIR2_LEAFN_MAGIC &&
- leafhdr.magic != XFS_DIR3_LEAFN_MAGIC))
- return false;
+ magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+ : XFS_DIR3_LEAFN_MAGIC;
+ if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+ return false;
if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
} else {
- if (leafhdr.magic != magic)
+ if (leaf->hdr.info.magic != cpu_to_be16(magic))
return false;
}
+
+ xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
}
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8993ec17452c..8f84153e98a8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -119,9 +119,9 @@ xfs_dir2_sf_getdents(
* mp->m_dirdatablk.
*/
dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- XFS_DIR3_DATA_DOT_OFFSET(mp));
+ xfs_dir3_data_dot_offset(mp));
dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
+ xfs_dir3_data_dotdot_offset(mp));
/*
* Put . entry unless we're starting past it.
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index bb6e2848f473..3ef6d402084c 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -557,7 +557,7 @@ xfs_dir2_sf_addname_hard(
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp),
+ for (offset = xfs_dir3_data_first_offset(mp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp),
add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
@@ -640,7 +640,7 @@ xfs_dir2_sf_addname_pick(
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
size = xfs_dir3_data_entsize(mp, args->namelen);
- offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+ offset = xfs_dir3_data_first_offset(mp);
sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
@@ -713,7 +713,7 @@ xfs_dir2_sf_check(
mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
+ offset = xfs_dir3_data_first_offset(mp);
ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 251c66632e5e..1ee776d477c3 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -64,7 +64,8 @@ int xfs_dqerror_mod = 33;
struct kmem_zone *xfs_qm_dqtrxzone;
static struct kmem_zone *xfs_qm_dqzone;
-static struct lock_class_key xfs_dquot_other_class;
+static struct lock_class_key xfs_dquot_group_class;
+static struct lock_class_key xfs_dquot_project_class;
/*
* This is called to free all the memory associated with a dquot
@@ -703,8 +704,20 @@ xfs_qm_dqread(
* Make sure group quotas have a different lock class than user
* quotas.
*/
- if (!(type & XFS_DQ_USER))
- lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+ switch (type) {
+ case XFS_DQ_USER:
+ /* uses the default lock class */
+ break;
+ case XFS_DQ_GROUP:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
+ break;
+ case XFS_DQ_PROJ:
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
XFS_STATS_INC(xs_qm_dquot);
@@ -940,13 +953,8 @@ xfs_qm_dqput_final(
trace_xfs_dqput_free(dqp);
- mutex_lock(&qi->qi_lru_lock);
- if (list_empty(&dqp->q_lru)) {
- list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
- qi->qi_lru_count++;
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(xs_qm_dquot_unused);
- }
- mutex_unlock(&qi->qi_lru_lock);
/*
* If we just added a udquot to the freelist, then we want to release
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 60c6e1f12695..e838d84b4e85 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -142,7 +142,8 @@ xfs_qm_dqunpin_wait(
STATIC uint
xfs_qm_dquot_logitem_push(
struct xfs_log_item *lip,
- struct list_head *buffer_list)
+ struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock)
+ __acquires(&lip->li_ailp->xa_lock)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
struct xfs_buf *bp = NULL;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 86f559f6e5d3..e43708e2f080 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -160,7 +160,8 @@ xfs_extent_busy_update_extent(
struct xfs_extent_busy *busyp,
xfs_agblock_t fbno,
xfs_extlen_t flen,
- bool userdata)
+ bool userdata) __releases(&pag->pagb_lock)
+ __acquires(&pag->pagb_lock)
{
xfs_agblock_t fend = fbno + flen;
xfs_agblock_t bbno = busyp->bno;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 1edb5cc3e5f4..18272c766a50 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -515,7 +515,7 @@ typedef struct xfs_swapext
/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
-#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 16219b9c6790..474807a401c8 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -48,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
/*
* Allocate and initialise an xfs_inode.
*/
-STATIC struct xfs_inode *
+struct xfs_inode *
xfs_inode_alloc(
struct xfs_mount *mp,
xfs_ino_t ino)
@@ -98,7 +98,7 @@ xfs_inode_free_callback(
kmem_zone_free(xfs_inode_zone, ip);
}
-STATIC void
+void
xfs_inode_free(
struct xfs_inode *ip)
{
@@ -119,11 +119,6 @@ xfs_inode_free(
ip->i_itemp = NULL;
}
- /* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(!xfs_isiflocked(ip));
-
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
@@ -135,6 +130,10 @@ xfs_inode_free(
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!xfs_isiflocked(ip));
+
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
@@ -1167,7 +1166,7 @@ xfs_reclaim_inodes(
* them to be cleaned, which we hope will not be very long due to the
* background walker having already kicked the IO off on those dirty inodes.
*/
-void
+long
xfs_reclaim_inodes_nr(
struct xfs_mount *mp,
int nr_to_scan)
@@ -1176,7 +1175,7 @@ xfs_reclaim_inodes_nr(
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}
/*
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a89f7d791bd..9ed68bb750f5 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -42,11 +42,15 @@ struct xfs_eofblocks {
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
+/* recovery needs direct inode allocation capability */
+struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
+void xfs_inode_free(struct xfs_inode *ip);
+
void xfs_reclaim_worker(struct work_struct *work);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index e011d597f12f..63382d37f565 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -53,9 +53,8 @@ xfs_inobp_check(
i * mp->m_sb.sb_inodesize);
if (!dip->di_next_unlinked) {
xfs_alert(mp,
- "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
- bp);
- ASSERT(dip->di_next_unlinked);
+ "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+ i, (long long)bp->b_bn);
}
}
}
@@ -106,11 +105,10 @@ xfs_inode_buf_verify(
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
mp, dip);
#ifdef DEBUG
- xfs_emerg(mp,
+ xfs_alert(mp,
"bad inode magic/vsn daddr %lld #%d (magic=%x)",
(unsigned long long)bp->b_bn, i,
be16_to_cpu(dip->di_magic));
- ASSERT(0);
#endif
}
}
@@ -196,7 +194,7 @@ xfs_imap_to_bp(
return 0;
}
-STATIC void
+void
xfs_dinode_from_disk(
xfs_icdinode_t *to,
xfs_dinode_t *from)
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
index 599e6c0ca2a9..abba0ae8cf2d 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/xfs_inode_buf.h
@@ -32,17 +32,17 @@ struct xfs_imap {
ushort im_boffset; /* inode offset in block in bytes */
};
-int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
- struct xfs_imap *, struct xfs_dinode **,
- struct xfs_buf **, uint, uint);
-int xfs_iread(struct xfs_mount *, struct xfs_trans *,
- struct xfs_inode *, uint);
-void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void xfs_dinode_to_disk(struct xfs_dinode *,
- struct xfs_icdinode *);
+int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_imap *, struct xfs_dinode **,
+ struct xfs_buf **, uint, uint);
+int xfs_iread(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_inode *, uint);
+void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
#if defined(DEBUG)
-void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#else
#define xfs_inobp_check(mp, bp)
#endif /* DEBUG */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bdebc21078d7..668e8f4ccf5e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -71,7 +71,7 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f = {0};
+ struct fd f = {NULL};
struct path path;
int error;
struct xfs_inode *ip;
@@ -456,12 +456,9 @@ xfs_attrlist_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
- if (!kbuf) {
- kbuf = kmem_zalloc_large(al_hreq.buflen);
- if (!kbuf)
- goto out_dput;
- }
+ kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+ if (!kbuf)
+ goto out_dput;
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -472,12 +469,9 @@ xfs_attrlist_by_handle(
if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
error = -EFAULT;
- out_kfree:
- if (is_vmalloc_addr(kbuf))
- kmem_free_large(kbuf);
- else
- kmem_free(kbuf);
- out_dput:
+out_kfree:
+ kmem_free(kbuf);
+out_dput:
dput(dentry);
return error;
}
@@ -495,12 +489,9 @@ xfs_attrmulti_attr_get(
if (*len > XATTR_SIZE_MAX)
return EINVAL;
- kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
- if (!kbuf) {
- kbuf = kmem_zalloc_large(*len);
- if (!kbuf)
- return ENOMEM;
- }
+ kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+ if (!kbuf)
+ return ENOMEM;
error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
if (error)
@@ -509,11 +500,8 @@ xfs_attrmulti_attr_get(
if (copy_to_user(ubuf, kbuf, *len))
error = EFAULT;
- out_kfree:
- if (is_vmalloc_addr(kbuf))
- kmem_free_large(kbuf);
- else
- kmem_free(kbuf);
+out_kfree:
+ kmem_free(kbuf);
return error;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index d3ab9534307f..f671f7e472ac 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -371,12 +371,9 @@ xfs_compat_attrlist_by_handle(
return PTR_ERR(dentry);
error = -ENOMEM;
- kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
- if (!kbuf) {
- kbuf = kmem_zalloc_large(al_hreq.buflen);
- if (!kbuf)
- goto out_dput;
- }
+ kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+ if (!kbuf)
+ goto out_dput;
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -387,12 +384,9 @@ xfs_compat_attrlist_by_handle(
if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
error = -EFAULT;
- out_kfree:
- if (is_vmalloc_addr(kbuf))
- kmem_free_large(kbuf);
- else
- kmem_free(kbuf);
- out_dput:
+out_kfree:
+ kmem_free(kbuf);
+out_dput:
dput(dentry);
return error;
}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b93e14b86754..084b3e1741fd 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -495,7 +495,7 @@ xfs_bulkstat(
/*
* Done, we're either out of filesystem or space to put the data.
*/
- kmem_free_large(irbuf);
+ kmem_free(irbuf);
*ubcountp = ubelem;
/*
* Found some inodes, return them now and return the error next time.
@@ -541,8 +541,9 @@ xfs_bulkstat_single(
* at the expense of the error case.
*/
- ino = (xfs_ino_t)*lastinop;
- error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
+ ino = *lastinop;
+ error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
+ NULL, &res);
if (error) {
/*
* Special case way failed, do it the "long" way
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5372d58ef93a..a2dea108071a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -257,7 +257,8 @@ xlog_grant_head_wait(
struct xlog *log,
struct xlog_grant_head *head,
struct xlog_ticket *tic,
- int need_bytes)
+ int need_bytes) __releases(&head->lock)
+ __acquires(&head->lock)
{
list_add_tail(&tic->t_queue, &head->waiters);
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index 31e3a06c4644..ca7e28a8ed31 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -474,6 +474,8 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
/*
@@ -487,7 +489,8 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
XFS_ILOG_UUID | XFS_ILOG_ADATA | \
- XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+ XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
XFS_ILOG_DBROOT)
@@ -499,7 +502,8 @@ typedef struct xfs_inode_log_format_64 {
XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
XFS_ILOG_DEV | XFS_ILOG_UUID | \
XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
- XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
+ XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+ XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
static inline int xfs_ilog_fbroot(int w)
{
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7c0c1fdc728b..39797490a1f1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1585,6 +1585,7 @@ xlog_recover_add_to_trans(
"bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
+ kmem_free(ptr);
return XFS_ERROR(EIO);
}
@@ -1970,6 +1971,13 @@ xlog_recover_do_inode_buffer(
* magic number. If we don't recognise the magic number in the buffer, then
* return a LSN of -1 so that the caller knows it was an unrecognised block and
* so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
*/
static xfs_lsn_t
xlog_recover_get_buf_lsn(
@@ -1980,6 +1988,8 @@ xlog_recover_get_buf_lsn(
__uint16_t magic16;
__uint16_t magicda;
void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
/* v4 filesystems always recover immediately */
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -1992,43 +2002,79 @@ xlog_recover_get_buf_lsn(
case XFS_ABTB_MAGIC:
case XFS_ABTC_MAGIC:
case XFS_IBT_CRC_MAGIC:
- case XFS_IBT_MAGIC:
- return be64_to_cpu(
- ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
case XFS_BMAP_CRC_MAGIC:
- case XFS_BMAP_MAGIC:
- return be64_to_cpu(
- ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
case XFS_AGF_MAGIC:
- return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
case XFS_AGFL_MAGIC:
- return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
case XFS_AGI_MAGIC:
- return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
case XFS_SYMLINK_MAGIC:
- return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
case XFS_DIR3_BLOCK_MAGIC:
case XFS_DIR3_DATA_MAGIC:
case XFS_DIR3_FREE_MAGIC:
- return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
case XFS_ATTR3_RMT_MAGIC:
- return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+ lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+ uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+ break;
case XFS_SB_MAGIC:
- return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
default:
break;
}
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
switch (magicda) {
case XFS_DIR3_LEAF1_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
case XFS_DA3_NODE_MAGIC:
- return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
default:
break;
}
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
/*
* We do individual object checks on dquot and inode buffers as they
* have their own individual LSN records. Also, we could have a stale
@@ -2629,6 +2675,82 @@ out_release:
return error;
}
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return ENOMEM;
+
+ /* instantiate the inode */
+ xfs_dinode_from_disk(&ip->i_d, dip);
+ ASSERT(ip->i_d.di_version >= 3);
+
+ error = xfs_iformat_fork(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
STATIC int
xlog_recover_inode_pass2(
struct xlog *log,
@@ -2681,8 +2803,7 @@ xlog_recover_inode_pass2(
error = bp->b_error;
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
- xfs_buf_relse(bp);
- goto error;
+ goto out_release;
}
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2692,30 +2813,31 @@ xlog_recover_inode_pass2(
* like an inode!
*/
if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
dicp = item->ri_buf[1].i_addr;
if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
__func__, item, in_f->ilf_ino);
XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
/*
* If the inode has an LSN in it, recover the inode only if it's less
- * than the lsn of the transaction we are replaying.
+ * than the lsn of the transaction we are replaying. Note: we still
+ * need to replay an owner change even though the inode is more recent
+ * than the transaction as there is no guarantee that all the btree
+ * blocks are more recent than this transaction, too.
*/
if (dip->di_version >= 3) {
xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
@@ -2723,7 +2845,7 @@ xlog_recover_inode_pass2(
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
- goto out_release;
+ goto out_owner_change;
}
}
@@ -2745,10 +2867,9 @@ xlog_recover_inode_pass2(
dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
- xfs_buf_relse(bp);
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
- goto error;
+ goto out_release;
}
}
@@ -2760,13 +2881,12 @@ xlog_recover_inode_pass2(
(dicp->di_format != XFS_DINODE_FMT_BTREE)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad regular inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
} else if (unlikely(S_ISDIR(dicp->di_mode))) {
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2774,19 +2894,17 @@ xlog_recover_inode_pass2(
(dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad dir inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
__func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
}
if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2794,29 +2912,27 @@ xlog_recover_inode_pass2(
dicp->di_nextents + dicp->di_anextents,
dicp->di_nblocks);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
isize = xfs_icdinode_size(dicp->di_version);
if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr 0x%p",
__func__, item->ri_buf[1].i_len, item);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
/* The core is in in-core format */
@@ -2842,7 +2958,7 @@ xlog_recover_inode_pass2(
}
if (in_f->ilf_size == 2)
- goto write_inode_buffer;
+ goto out_owner_change;
len = item->ri_buf[2].i_len;
src = item->ri_buf[2].i_addr;
ASSERT(in_f->ilf_size <= 4);
@@ -2903,13 +3019,15 @@ xlog_recover_inode_pass2(
default:
xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- xfs_buf_relse(bp);
error = EIO;
- goto error;
+ goto out_release;
}
}
-write_inode_buffer:
+out_owner_change:
+ if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
/* re-generate the checksum. */
xfs_dinode_calc_crc(log->l_mp, dip);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6218a0aeeeea..3e6c2e6c9cd2 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -51,8 +51,9 @@
*/
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
+
+STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
* currently is the only interface into the radix tree code that allows
@@ -203,12 +204,9 @@ xfs_qm_dqpurge(
* We move dquots to the freelist as soon as their reference count
* hits zero, so it really should be on the freelist here.
*/
- mutex_lock(&qi->qi_lru_lock);
ASSERT(!list_empty(&dqp->q_lru));
- list_del_init(&dqp->q_lru);
- qi->qi_lru_count--;
+ list_lru_del(&qi->qi_lru, &dqp->q_lru);
XFS_STATS_DEC(xs_qm_dquot_unused);
- mutex_unlock(&qi->qi_lru_lock);
xfs_qm_dqdestroy(dqp);
@@ -680,6 +678,143 @@ xfs_qm_calc_dquots_per_chunk(
return ndquots;
}
+struct xfs_qm_isolate {
+ struct list_head buffers;
+ struct list_head dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+ struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct xfs_dquot *dqp = container_of(item,
+ struct xfs_dquot, q_lru);
+ struct xfs_qm_isolate *isol = arg;
+
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_miss_busy;
+
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
+ XFS_STATS_INC(xs_qm_dqwants);
+
+ trace_xfs_dqreclaim_want(dqp);
+ list_del_init(&dqp->q_lru);
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ return LRU_REMOVED;
+ }
+
+ /*
+ * If the dquot is dirty, flush it. If it's already being flushed, just
+ * skip it so there is time for the IO to complete before we try to
+ * reclaim it again on the next LRU pass.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ xfs_dqunlock(dqp);
+ goto out_miss_busy;
+ }
+
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ trace_xfs_dqreclaim_dirty(dqp);
+
+ /* we have to drop the LRU lock to flush the dquot */
+ spin_unlock(lru_lock);
+
+ error = xfs_qm_dqflush(dqp, &bp);
+ if (error) {
+ xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+ __func__, dqp);
+ goto out_unlock_dirty;
+ }
+
+ xfs_buf_delwri_queue(bp, &isol->buffers);
+ xfs_buf_relse(bp);
+ goto out_unlock_dirty;
+ }
+ xfs_dqfunlock(dqp);
+
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
+
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_lru, &isol->dispose);
+ XFS_STATS_DEC(xs_qm_dquot_unused);
+ trace_xfs_dqreclaim_done(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaims);
+ return LRU_REMOVED;
+
+out_miss_busy:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ return LRU_SKIP;
+
+out_unlock_dirty:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ xfs_dqunlock(dqp);
+ spin_lock(lru_lock);
+ return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+ struct xfs_qm_isolate isol;
+ unsigned long freed;
+ int error;
+ unsigned long nr_to_scan = sc->nr_to_scan;
+
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ return 0;
+
+ INIT_LIST_HEAD(&isol.buffers);
+ INIT_LIST_HEAD(&isol.dispose);
+
+ freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
+ &nr_to_scan);
+
+ error = xfs_buf_delwri_submit(&isol.buffers);
+ if (error)
+ xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+ while (!list_empty(&isol.dispose)) {
+ struct xfs_dquot *dqp;
+
+ dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+ list_del_init(&dqp->q_lru);
+ xfs_qm_dqfree_one(dqp);
+ }
+
+ return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_quotainfo *qi = container_of(shrink,
+ struct xfs_quotainfo, qi_shrinker);
+
+ return list_lru_count_node(&qi->qi_lru, sc->nid);
+}
+
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -696,11 +831,18 @@ xfs_qm_init_quotainfo(
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+ if ((error = list_lru_init(&qinf->qi_lru))) {
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
+ }
+
/*
* See if quotainodes are setup, and if not, allocate them,
* and change the superblock accordingly.
*/
if ((error = xfs_qm_init_quotainos(mp))) {
+ list_lru_destroy(&qinf->qi_lru);
kmem_free(qinf);
mp->m_quotainfo = NULL;
return error;
@@ -711,10 +853,6 @@ xfs_qm_init_quotainfo(
INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
mutex_init(&qinf->qi_tree_lock);
- INIT_LIST_HEAD(&qinf->qi_lru_list);
- qinf->qi_lru_count = 0;
- mutex_init(&qinf->qi_lru_lock);
-
/* mutex used to serialize quotaoffs */
mutex_init(&qinf->qi_quotaofflock);
@@ -779,8 +917,10 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
- qinf->qi_shrinker.shrink = xfs_qm_shake;
+ qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+ qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&qinf->qi_shrinker);
return 0;
}
@@ -801,6 +941,7 @@ xfs_qm_destroy_quotainfo(
ASSERT(qi != NULL);
unregister_shrinker(&qi->qi_shrinker);
+ list_lru_destroy(&qi->qi_lru);
if (qi->qi_uquotaip) {
IRELE(qi->qi_uquotaip);
@@ -1599,132 +1740,6 @@ xfs_qm_dqfree_one(
xfs_qm_dqdestroy(dqp);
}
-STATIC void
-xfs_qm_dqreclaim_one(
- struct xfs_dquot *dqp,
- struct list_head *buffer_list,
- struct list_head *dispose_list)
-{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- int error;
-
- if (!xfs_dqlock_nowait(dqp))
- goto out_move_tail;
-
- /*
- * This dquot has acquired a reference in the meantime remove it from
- * the freelist and try again.
- */
- if (dqp->q_nrefs) {
- xfs_dqunlock(dqp);
-
- trace_xfs_dqreclaim_want(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
-
- list_del_init(&dqp->q_lru);
- qi->qi_lru_count--;
- XFS_STATS_DEC(xs_qm_dquot_unused);
- return;
- }
-
- /*
- * Try to grab the flush lock. If this dquot is in the process of
- * getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto out_unlock_move_tail;
-
- if (XFS_DQ_IS_DIRTY(dqp)) {
- struct xfs_buf *bp = NULL;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- goto out_unlock_move_tail;
- }
-
- xfs_buf_delwri_queue(bp, buffer_list);
- xfs_buf_relse(bp);
- /*
- * Give the dquot another try on the freelist, as the
- * flushing will take some time.
- */
- goto out_unlock_move_tail;
- }
- xfs_dqfunlock(dqp);
-
- /*
- * Prevent lookups now that we are past the point of no return.
- */
- dqp->dq_flags |= XFS_DQ_FREEING;
- xfs_dqunlock(dqp);
-
- ASSERT(dqp->q_nrefs == 0);
- list_move_tail(&dqp->q_lru, dispose_list);
- qi->qi_lru_count--;
- XFS_STATS_DEC(xs_qm_dquot_unused);
-
- trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
- return;
-
- /*
- * Move the dquot to the tail of the list so that we don't spin on it.
- */
-out_unlock_move_tail:
- xfs_dqunlock(dqp);
-out_move_tail:
- list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
- trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
-{
- struct xfs_quotainfo *qi =
- container_of(shrink, struct xfs_quotainfo, qi_shrinker);
- int nr_to_scan = sc->nr_to_scan;
- LIST_HEAD (buffer_list);
- LIST_HEAD (dispose_list);
- struct xfs_dquot *dqp;
- int error;
-
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
- return 0;
- if (!nr_to_scan)
- goto out;
-
- mutex_lock(&qi->qi_lru_lock);
- while (!list_empty(&qi->qi_lru_list)) {
- if (nr_to_scan-- <= 0)
- break;
- dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
- q_lru);
- xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
- }
- mutex_unlock(&qi->qi_lru_lock);
-
- error = xfs_buf_delwri_submit(&buffer_list);
- if (error)
- xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
- while (!list_empty(&dispose_list)) {
- dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
- list_del_init(&dqp->q_lru);
- xfs_qm_dqfree_one(dqp);
- }
-
-out:
- return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
-}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 670cd4464070..2b602df9c242 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -49,9 +49,7 @@ typedef struct xfs_quotainfo {
struct xfs_inode *qi_uquotaip; /* user quota inode */
struct xfs_inode *qi_gquotaip; /* group quota inode */
struct xfs_inode *qi_pquotaip; /* project quota inode */
- struct list_head qi_lru_list;
- struct mutex qi_lru_lock;
- int qi_lru_count;
+ struct list_lru qi_lru;
int qi_dquots;
time_t qi_btimelimit; /* limit for blks timer */
time_t qi_itimelimit; /* limit for inodes timer */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 979a77d4b87d..15188cc99449 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1535,19 +1535,21 @@ xfs_fs_mount(
return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
}
-static int
+static long
xfs_fs_nr_cached_objects(
- struct super_block *sb)
+ struct super_block *sb,
+ int nid)
{
return xfs_reclaim_inodes_count(XFS_M(sb));
}
-static void
+static long
xfs_fs_free_cached_objects(
struct super_block *sb,
- int nr_to_scan)
+ long nr_to_scan,
+ int nid)
{
- xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+ return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
}
static const struct super_operations xfs_super_operations = {
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 2f2a7c005be2..f622a97a7e33 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -41,6 +41,7 @@
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_symlink.h"
+#include "xfs_buf_item.h"
/* ----- Kernel only functions below ----- */
STATIC int
@@ -363,6 +364,7 @@ xfs_symlink(
pathlen -= byte_cnt;
offset += byte_cnt;
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
(char *)bp->b_addr);
}