diff options
Diffstat (limited to 'fs/xfs')
37 files changed, 880 insertions, 512 deletions
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 4a7286c1dc80..a02cfb9e3bce 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -27,8 +27,6 @@ /* * Greedy allocation. May fail and may return vmalloced memory. - * - * Must be freed using kmem_free_large. */ void * kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) @@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) void *ptr; size_t kmsize = maxsize; - while (!(ptr = kmem_zalloc_large(kmsize))) { + while (!(ptr = vzalloc(kmsize))) { if ((kmsize >>= 1) <= minsize) kmsize = minsize; } @@ -75,6 +73,17 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return ptr; } +void * +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + ptr = kmem_zalloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + return vzalloc(size); +} + void kmem_free(const void *ptr) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index b2f2620f9a87..3a7371cab508 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -57,17 +57,10 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); -static inline void *kmem_zalloc_large(size_t size) -{ - return vzalloc(size); -} -static inline void kmem_free_large(void *ptr) -{ - vfree(ptr); -} extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 69518960b2ba..0e2f37efedd0 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. */ len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(len, GFP_KERNEL); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -175,10 +175,10 @@ xfs_get_acl(struct inode *inode, int type) if (IS_ERR(acl)) goto out; - out_update_cache: +out_update_cache: set_cached_acl(inode, type, acl); - out: - kfree(xfs_acl); +out: + kmem_free(xfs_acl); return acl; } @@ -209,7 +209,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) struct xfs_acl *xfs_acl; int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(len, GFP_KERNEL); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); if (!xfs_acl) return -ENOMEM; @@ -222,7 +222,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); - kfree(xfs_acl); + kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 977da0ec6604..e51e581454e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1582,7 +1582,7 @@ xfs_vm_write_begin( unlock_page(page); if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, pos + len, i_size_read(inode)); + truncate_pagecache(inode, i_size_read(inode)); page_cache_release(page); page = NULL; @@ -1618,7 +1618,7 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, to, isize); + truncate_pagecache(inode, isize); xfs_vm_kill_delalloc_range(inode, isize, to); } } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 92b830901d60..f47e65c30be6 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4450,7 +4450,7 @@ xfs_bmapi_write( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index cf3bc76710c3..bb8de8e399c4 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -925,3 +925,47 @@ xfs_bmdr_maxrecs( return blocklen / sizeof(xfs_bmdr_rec_t); return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); } + +/* + * Change the owner of a btree format fork fo the inode passed in. Change it to + * the owner of that is passed in so that we can change owners before or after + * we switch forks between inodes. The operation that the caller is doing will + * determine whether is needs to change owner before or after the switch. + * + * For demand paged transactional modification, the fork switch should be done + * after reading in all the blocks, modifying them and pinning them in the + * transaction. For modification when the buffers are already pinned in memory, + * the fork switch can be done before changing the owner as we won't need to + * validate the owner until the btree buffers are unpinned and writes can occur + * again. + * + * For recovery based ownership change, there is no transactional context and + * so a buffer list must be supplied so that we can record the buffers that we + * modified for the caller to issue IO on. + */ +int +xfs_bmbt_change_owner( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_ino_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(tp || buffer_list); + ASSERT(!(tp && buffer_list)); + if (whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); + else + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return ENOMEM; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + return error; +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 1b726d626941..e367461a638e 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -236,6 +236,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_ino_t new_owner, + struct list_head *buffer_list); + extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 541d59f5e658..97f952caea74 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -612,13 +612,9 @@ xfs_getbmap( if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) return XFS_ERROR(ENOMEM); - out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); - if (!out) { - out = kmem_zalloc_large(bmv->bmv_count * - sizeof(struct getbmapx)); - if (!out) - return XFS_ERROR(ENOMEM); - } + out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); + if (!out) + return XFS_ERROR(ENOMEM); xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { @@ -754,10 +750,7 @@ xfs_getbmap( break; } - if (is_vmalloc_addr(out)) - kmem_free_large(out); - else - kmem_free(out); + kmem_free(out); return error; } @@ -1789,14 +1782,6 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; - /* - * We have no way of updating owner information in the BMBT blocks for - * each inode on CRC enabled filesystems, so to avoid corrupting the - * this metadata we simply don't allow extent swaps to occur. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return XFS_ERROR(EINVAL); - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); @@ -1920,6 +1905,42 @@ xfs_swap_extents( goto out_trans_cancel; } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Before we've swapped the forks, lets set the owners of the forks + * appropriately. We have to do this as we are demand paging the btree + * buffers, and so the validation done on read will expect the owner + * field to be correctly set. Once we change the owners, we can swap the + * inode forks. + * + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + if (ip->i_d.di_version == 3 && + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + target_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + if (tip->i_d.di_version == 3 && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + src_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + /* * Swap the data forks of the inodes */ @@ -1957,7 +1978,6 @@ xfs_swap_extents( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - src_log_flags = XFS_ILOG_CORE; switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the @@ -1971,11 +1991,12 @@ xfs_swap_extents( src_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_d.di_version < 3 || + (src_log_flags & XFS_ILOG_DOWNER)); src_log_flags |= XFS_ILOG_DBROOT; break; } - target_log_flags = XFS_ILOG_CORE; switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the @@ -1990,13 +2011,11 @@ xfs_swap_extents( break; case XFS_DINODE_FMT_BTREE: target_log_flags |= XFS_ILOG_DBROOT; + ASSERT(tip->i_d.di_version < 3 || + (target_log_flags & XFS_ILOG_DOWNER)); break; } - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 7a2b4da3c0db..5690e102243d 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -855,6 +855,41 @@ xfs_btree_readahead( return xfs_btree_readahead_sblock(cur, lr, block); } +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +/* + * Readahead @count btree blocks at the given @ptr location. + * + * We don't need to care about long or short form btrees here as we have a + * method of converting the ptr directly to a daddr available to us. + */ +STATIC void +xfs_btree_readahead_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + xfs_extlen_t count) +{ + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, + xfs_btree_ptr_to_daddr(cur, ptr), + cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); +} + /* * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. @@ -1073,24 +1108,6 @@ xfs_btree_buf_to_ptr( } } -STATIC xfs_daddr_t -xfs_btree_ptr_to_daddr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); - - return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); - } else { - ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); - ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); - - return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(ptr->s)); - } -} - STATIC void xfs_btree_set_refs( struct xfs_btree_cur *cur, @@ -3869,3 +3886,120 @@ xfs_btree_get_rec( *stat = 1; return 0; } + +/* + * Change the owner of a btree. + * + * The mechanism we use here is ordered buffer logging. Because we don't know + * how many buffers were are going to need to modify, we don't really want to + * have to make transaction reservations for the worst case of every buffer in a + * full size btree as that may be more space that we can fit in the log.... + * + * We do the btree walk in the most optimal manner possible - we have sibling + * pointers so we can just walk all the blocks on each level from left to right + * in a single pass, and then move to the next level and do the same. We can + * also do readahead on the sibling pointers to get IO moving more quickly, + * though for slow disks this is unlikely to make much difference to performance + * as the amount of CPU work we have to do before moving to the next block is + * relatively small. + * + * For each btree block that we load, modify the owner appropriately, set the + * buffer as an ordered buffer and log it appropriately. We need to ensure that + * we mark the region we change dirty so that if the buffer is relogged in + * a subsequent transaction the changes we make here as an ordered buffer are + * correctly relogged in that transaction. If we are in recovery context, then + * just queue the modified buffer as delayed write buffer so the transaction + * recovery completion writes the changes to disk. + */ +static int +xfs_btree_block_change_owner( + struct xfs_btree_cur *cur, + int level, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + else + block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + + /* + * If the block is a root block hosted in an inode, we might not have a + * buffer pointer here and we shouldn't attempt to log the change as the + * information is already held in the inode and discarded when the root + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ + if (bp) { + if (cur->bc_tp) { + xfs_trans_ordered_buf(cur->bc_tp, bp); + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + } else { + xfs_buf_delwri_queue(bp, buffer_list); + } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + } + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + +int +xfs_btree_change_owner( + struct xfs_btree_cur *cur, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_block_change_owner(cur, level, + new_owner, + buffer_list); + } while (!error); + + if (error != ENOENT) + return error; + } + + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index c8473c7ef45e..06729b67ad58 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -121,15 +121,18 @@ union xfs_btree_rec { /* * For logging record fields. */ -#define XFS_BB_MAGIC 0x01 -#define XFS_BB_LEVEL 0x02 -#define XFS_BB_NUMRECS 0x04 -#define XFS_BB_LEFTSIB 0x08 -#define XFS_BB_RIGHTSIB 0x10 -#define XFS_BB_BLKNO 0x20 +#define XFS_BB_MAGIC (1 << 0) +#define XFS_BB_LEVEL (1 << 1) +#define XFS_BB_NUMRECS (1 << 2) +#define XFS_BB_LEFTSIB (1 << 3) +#define XFS_BB_RIGHTSIB (1 << 4) +#define XFS_BB_BLKNO (1 << 5) +#define XFS_BB_LSN (1 << 6) +#define XFS_BB_UUID (1 << 7) +#define XFS_BB_OWNER (1 << 8) #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) -#define XFS_BB_NUM_BITS_CRC 8 +#define XFS_BB_NUM_BITS_CRC 9 #define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* @@ -442,6 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, + struct list_head *buffer_list); /* * btree block CRC helpers diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c06823fe10d3..263470075ea2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -81,54 +81,6 @@ xfs_buf_vmap_len( } /* - * xfs_buf_lru_add - add a buffer to the LRU. - * - * The LRU takes a new reference to the buffer so that it will only be freed - * once the shrinker takes the buffer off the LRU. - */ -STATIC void -xfs_buf_lru_add( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (list_empty(&bp->b_lru)) { - atomic_inc(&bp->b_hold); - list_add_tail(&bp->b_lru, &btp->bt_lru); - btp->bt_lru_nr++; - bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * xfs_buf_lru_del - remove a buffer from the LRU - * - * The unlocked check is safe here because it only occurs when there are not - * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there - * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unnecessary round trip on the - * bt_lru_lock. - */ -STATIC void -xfs_buf_lru_del( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - if (list_empty(&bp->b_lru)) - return; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -151,20 +103,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - atomic_set(&(bp)->b_lru_ref, 0); - if (!list_empty(&bp->b_lru)) { - struct xfs_buftarg *btp = bp->b_target; + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru) && - !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - atomic_dec(&bp->b_hold); - } - spin_unlock(&btp->bt_lru_lock); - } ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); } static int @@ -228,6 +174,7 @@ _xfs_buf_alloc( INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); bp->b_target = target; bp->b_flags = flags; @@ -917,12 +864,33 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (!(bp->b_flags & XBF_STALE) && - atomic_read(&bp->b_lru_ref)) { - xfs_buf_lru_add(bp); + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); spin_unlock(&pag->pag_buf_lock); } else { - xfs_buf_lru_del(bp); + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1502,83 +1470,121 @@ xfs_buf_iomove( * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_buf *bp; + LIST_HEAD(dispose); + int loop = 0; -restart: - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - if (atomic_read(&bp->b_hold) > 1) { - trace_xfs_buf_wait_buftarg(bp, _RET_IP_); - list_move_tail(&bp->b_lru, &btp->bt_lru); - spin_unlock(&btp->bt_lru_lock); - delay(100); - goto restart; + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); } - /* - * clear the LRU reference count so the buffer doesn't get - * ignored in xfs_buf_rele(). - */ - atomic_set(&bp->b_lru_ref, 0); - spin_unlock(&btp->bt_lru_lock); - xfs_buf_rele(bp); - spin_lock(&btp->bt_lru_lock); + if (loop++ != 0) + delay(100); } - spin_unlock(&btp->bt_lru_lock); } -int -xfs_buftarg_shrink( +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - struct xfs_buf *bp; - int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); + unsigned long freed; + unsigned long nr_to_scan = sc->nr_to_scan; - if (!nr_to_scan) - return btp->bt_lru_nr; - - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - if (nr_to_scan-- <= 0) - break; - - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - - /* - * Decrement the b_lru_ref count unless the value is already - * zero. If the value is already zero, we need to reclaim the - * buffer, otherwise it gets another trip through the LRU. - */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - list_move_tail(&bp->b_lru, &btp->bt_lru); - continue; - } - - /* - * remove the buffer from the LRU now to avoid needing another - * lock round trip inside xfs_buf_rele(). - */ - list_move(&bp->b_lru, &dispose); - btp->bt_lru_nr--; - bp->b_lru_flags |= _XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); + freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, + &dispose, &nr_to_scan); while (!list_empty(&dispose)) { + struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_rele(bp); } - return btp->bt_lru_nr; + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_count_node(&btp->bt_lru, sc->nid); } void @@ -1587,6 +1593,7 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); @@ -1660,12 +1667,16 @@ xfs_alloc_buftarg( if (!btp->bt_bdi) goto error; - INIT_LIST_HEAD(&btp->bt_lru); - spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - btp->bt_shrinker.shrink = xfs_buftarg_shrink; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&btp->bt_shrinker); return btp; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 433a12ed7b17..e65683361017 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/uio.h> +#include <linux/list_lru.h> /* * Base types @@ -59,7 +60,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -78,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } + { _XBF_COMPOUND, "COMPOUND" } + +/* + * Internal state flags. + */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ typedef struct xfs_buftarg { dev_t bt_dev; @@ -92,9 +96,7 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; - struct list_head bt_lru; - spinlock_t bt_lru_lock; - unsigned int bt_lru_nr; + struct list_lru bt_lru; } xfs_buftarg_t; struct xfs_buf; @@ -137,7 +139,8 @@ typedef struct xfs_buf { * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ - xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3a944b198e35..f1d85cfc0a54 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -613,13 +613,28 @@ xfs_buf_item_unlock( } } } - if (clean || aborted) { - if (atomic_dec_and_test(&bip->bli_refcount)) { - ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp)); + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&lip->li_ailp->xa_lock); + xfs_trans_ail_delete(lip->li_ailp, lip, + SHUTDOWN_LOG_IO_ERROR); + } xfs_buf_item_relse(bp); } - } else - atomic_dec(&bip->bli_refcount); + } if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index d4e59a4ff59f..20bf8e8002d6 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -635,6 +635,7 @@ xfs_da3_root_split( xfs_trans_log_buf(tp, bp, 0, size - 1); bp->b_ops = blk1->bp->b_ops; + xfs_trans_buf_copy_type(bp, blk1->bp); blk1->bp = bp; blk1->blkno = blkno; @@ -1223,6 +1224,7 @@ xfs_da3_node_toosmall( /* start with smaller blk num */ forward = nodehdr.forw < nodehdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_da3_icnode_hdr thdr; if (forward) blkno = nodehdr.forw; else @@ -1235,10 +1237,10 @@ xfs_da3_node_toosmall( return(error); node = bp->b_addr; - xfs_da3_node_hdr_from_disk(&nodehdr, node); + xfs_da3_node_hdr_from_disk(&thdr, node); xfs_trans_brelse(state->args->trans, bp); - if (count - nodehdr.count >= 0) + if (count - thdr.count >= 0) break; /* fits with at least 25% to spare */ } if (i >= 2) { diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 0957aa98b6c0..12dad188939d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1158,7 +1158,7 @@ xfs_dir2_sf_to_block( /* * Create entry for . */ - dep = xfs_dir3_data_dot_entry_p(hdr); + dep = xfs_dir3_data_dot_entry_p(mp, hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; @@ -1172,7 +1172,7 @@ xfs_dir2_sf_to_block( /* * Create entry for .. */ - dep = xfs_dir3_data_dotdot_entry_p(hdr); + dep = xfs_dir3_data_dotdot_entry_p(mp, hdr); dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; @@ -1183,7 +1183,7 @@ xfs_dir2_sf_to_block( blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)hdr)); - offset = xfs_dir3_data_first_offset(hdr); + offset = xfs_dir3_data_first_offset(mp); /* * Loop over existing entries, stuff them in. */ diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a0961a61ac1a..9cf67381adf6 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -497,69 +497,58 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) /* * Offsets of . and .. in data space (always block 0) * - * The macros are used for shortform directories as they have no headers to read - * the magic number out of. Shortform directories need to know the size of the - * data block header because the sfe embeds the block offset of the entry into - * it so that it doesn't change when format conversion occurs. Bad Things Happen - * if we don't follow this rule. - * * XXX: there is scope for significant optimisation of the logic here. Right * now we are checking for "dir3 format" over and over again. Ideally we should * only do it once for each operation. */ -#define XFS_DIR3_DATA_DOT_OFFSET(mp) \ - xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) -#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ - (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1)) -#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ - (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2)) - static inline xfs_dir2_data_aoff_t -xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dot_offset(struct xfs_mount *mp) { - return xfs_dir3_data_entry_offset(hdr); + return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); } static inline xfs_dir2_data_aoff_t -xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dotdot_offset(struct xfs_mount *mp) { - bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - return xfs_dir3_data_dot_offset(hdr) + - __xfs_dir3_data_entsize(dir3, 1); + return xfs_dir3_data_dot_offset(mp) + + xfs_dir3_data_entsize(mp, 1); } static inline xfs_dir2_data_aoff_t -xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_first_offset(struct xfs_mount *mp) { - bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - return xfs_dir3_data_dotdot_offset(hdr) + - __xfs_dir3_data_entsize(dir3, 2); + return xfs_dir3_data_dotdot_offset(mp) + + xfs_dir3_data_entsize(mp, 2); } /* * location of . and .. in data space (always block 0) */ static inline struct xfs_dir2_data_entry * -xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dot_entry_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) { return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_dot_offset(hdr)); + ((char *)hdr + xfs_dir3_data_dot_offset(mp)); } static inline struct xfs_dir2_data_entry * -xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dotdot_entry_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) { return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr)); + ((char *)hdr + xfs_dir3_data_dotdot_offset(mp)); } static inline struct xfs_dir2_data_entry * -xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_first_entry_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) { return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_first_offset(hdr)); + ((char *)hdr + xfs_dir3_data_first_offset(mp)); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 08984eeee159..1021c8356d08 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -180,6 +180,11 @@ xfs_dir3_leaf_check_int( return true; } +/* + * We verify the magic numbers before decoding the leaf header so that on debug + * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due + * to incorrect magic numbers. + */ static bool xfs_dir3_leaf_verify( struct xfs_buf *bp, @@ -191,24 +196,25 @@ xfs_dir3_leaf_verify( ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + __uint16_t magic3; - if ((magic == XFS_DIR2_LEAF1_MAGIC && - leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) || - (magic == XFS_DIR2_LEAFN_MAGIC && - leafhdr.magic != XFS_DIR3_LEAFN_MAGIC)) - return false; + magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC + : XFS_DIR3_LEAFN_MAGIC; + if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) + return false; if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; } else { - if (leafhdr.magic != magic) + if (leaf->hdr.info.magic != cpu_to_be16(magic)) return false; } + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8993ec17452c..8f84153e98a8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -119,9 +119,9 @@ xfs_dir2_sf_getdents( * mp->m_dirdatablk. */ dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR3_DATA_DOT_OFFSET(mp)); + xfs_dir3_data_dot_offset(mp)); dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); + xfs_dir3_data_dotdot_offset(mp)); /* * Put . entry unless we're starting past it. diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index bb6e2848f473..3ef6d402084c 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -557,7 +557,7 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp), + for (offset = xfs_dir3_data_first_offset(mp), oldsfep = xfs_dir2_sf_firstentry(oldsfp), add_datasize = xfs_dir3_data_entsize(mp, args->namelen), eof = (char *)oldsfep == &buf[old_isize]; @@ -640,7 +640,7 @@ xfs_dir2_sf_addname_pick( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir3_data_entsize(mp, args->namelen); - offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); + offset = xfs_dir3_data_first_offset(mp); sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -713,7 +713,7 @@ xfs_dir2_sf_check( mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); + offset = xfs_dir3_data_first_offset(mp); ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 251c66632e5e..1ee776d477c3 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -64,7 +64,8 @@ int xfs_dqerror_mod = 33; struct kmem_zone *xfs_qm_dqtrxzone; static struct kmem_zone *xfs_qm_dqzone; -static struct lock_class_key xfs_dquot_other_class; +static struct lock_class_key xfs_dquot_group_class; +static struct lock_class_key xfs_dquot_project_class; /* * This is called to free all the memory associated with a dquot @@ -703,8 +704,20 @@ xfs_qm_dqread( * Make sure group quotas have a different lock class than user * quotas. */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + switch (type) { + case XFS_DQ_USER: + /* uses the default lock class */ + break; + case XFS_DQ_GROUP: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); + break; + case XFS_DQ_PROJ: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); + break; + default: + ASSERT(0); + break; + } XFS_STATS_INC(xs_qm_dquot); @@ -940,13 +953,8 @@ xfs_qm_dqput_final( trace_xfs_dqput_free(dqp); - mutex_lock(&qi->qi_lru_lock); - if (list_empty(&dqp->q_lru)) { - list_add_tail(&dqp->q_lru, &qi->qi_lru_list); - qi->qi_lru_count++; + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(xs_qm_dquot_unused); - } - mutex_unlock(&qi->qi_lru_lock); /* * If we just added a udquot to the freelist, then we want to release diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 60c6e1f12695..e838d84b4e85 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -142,7 +142,8 @@ xfs_qm_dqunpin_wait( STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, - struct list_head *buffer_list) + struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_buf *bp = NULL; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 86f559f6e5d3..e43708e2f080 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -160,7 +160,8 @@ xfs_extent_busy_update_extent( struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, - bool userdata) + bool userdata) __releases(&pag->pagb_lock) + __acquires(&pag->pagb_lock) { xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 1edb5cc3e5f4..18272c766a50 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -515,7 +515,7 @@ typedef struct xfs_swapext /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) -#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 16219b9c6790..474807a401c8 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -48,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, /* * Allocate and initialise an xfs_inode. */ -STATIC struct xfs_inode * +struct xfs_inode * xfs_inode_alloc( struct xfs_mount *mp, xfs_ino_t ino) @@ -98,7 +98,7 @@ xfs_inode_free_callback( kmem_zone_free(xfs_inode_zone, ip); } -STATIC void +void xfs_inode_free( struct xfs_inode *ip) { @@ -119,11 +119,6 @@ xfs_inode_free( ip->i_itemp = NULL; } - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -135,6 +130,10 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } @@ -1167,7 +1166,7 @@ xfs_reclaim_inodes( * them to be cleaned, which we hope will not be very long due to the * background walker having already kicked the IO off on those dirty inodes. */ -void +long xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) @@ -1176,7 +1175,7 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); } /* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a89f7d791bd..9ed68bb750f5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -42,11 +42,15 @@ struct xfs_eofblocks { int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); +/* recovery needs direct inode allocation capability */ +struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino); +void xfs_inode_free(struct xfs_inode *ip); + void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); -void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c index e011d597f12f..63382d37f565 100644 --- a/fs/xfs/xfs_inode_buf.c +++ b/fs/xfs/xfs_inode_buf.c @@ -53,9 +53,8 @@ xfs_inobp_check( i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, - "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", - bp); - ASSERT(dip->di_next_unlinked); + "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", + i, (long long)bp->b_bn); } } } @@ -106,11 +105,10 @@ xfs_inode_buf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, mp, dip); #ifdef DEBUG - xfs_emerg(mp, + xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", (unsigned long long)bp->b_bn, i, be16_to_cpu(dip->di_magic)); - ASSERT(0); #endif } } @@ -196,7 +194,7 @@ xfs_imap_to_bp( return 0; } -STATIC void +void xfs_dinode_from_disk( xfs_icdinode_t *to, xfs_dinode_t *from) diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h index 599e6c0ca2a9..abba0ae8cf2d 100644 --- a/fs/xfs/xfs_inode_buf.h +++ b/fs/xfs/xfs_inode_buf.h @@ -32,17 +32,17 @@ struct xfs_imap { ushort im_boffset; /* inode offset in block in bytes */ }; -int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, - struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); -void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); -void xfs_dinode_to_disk(struct xfs_dinode *, - struct xfs_icdinode *); +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); +void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); #if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bdebc21078d7..668e8f4ccf5e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -71,7 +71,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f = {0}; + struct fd f = {NULL}; struct path path; int error; struct xfs_inode *ip; @@ -456,12 +456,9 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(al_hreq.buflen); - if (!kbuf) - goto out_dput; - } + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -472,12 +469,9 @@ xfs_attrlist_by_handle( if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) error = -EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); - out_dput: +out_kfree: + kmem_free(kbuf); +out_dput: dput(dentry); return error; } @@ -495,12 +489,9 @@ xfs_attrmulti_attr_get( if (*len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(*len); - if (!kbuf) - return ENOMEM; - } + kbuf = kmem_zalloc_large(*len, KM_SLEEP); + if (!kbuf) + return ENOMEM; error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) @@ -509,11 +500,8 @@ xfs_attrmulti_attr_get( if (copy_to_user(ubuf, kbuf, *len)) error = EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); +out_kfree: + kmem_free(kbuf); return error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index d3ab9534307f..f671f7e472ac 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -371,12 +371,9 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(al_hreq.buflen); - if (!kbuf) - goto out_dput; - } + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -387,12 +384,9 @@ xfs_compat_attrlist_by_handle( if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) error = -EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); - out_dput: +out_kfree: + kmem_free(kbuf); +out_dput: dput(dentry); return error; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b93e14b86754..084b3e1741fd 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -495,7 +495,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free_large(irbuf); + kmem_free(irbuf); *ubcountp = ubelem; /* * Found some inodes, return them now and return the error next time. @@ -541,8 +541,9 @@ xfs_bulkstat_single( * at the expense of the error case. */ - ino = (xfs_ino_t)*lastinop; - error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); + ino = *lastinop; + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), + NULL, &res); if (error) { /* * Special case way failed, do it the "long" way diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5372d58ef93a..a2dea108071a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -257,7 +257,8 @@ xlog_grant_head_wait( struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) { list_add_tail(&tic->t_queue, &head->waiters); diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h index 31e3a06c4644..ca7e28a8ed31 100644 --- a/fs/xfs/xfs_log_format.h +++ b/fs/xfs/xfs_log_format.h @@ -474,6 +474,8 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ +#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ +#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ /* @@ -487,7 +489,8 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT) + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -499,7 +502,8 @@ typedef struct xfs_inode_log_format_64 { XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ XFS_ILOG_DEV | XFS_ILOG_UUID | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) static inline int xfs_ilog_fbroot(int w) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 7c0c1fdc728b..39797490a1f1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1585,6 +1585,7 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); + kmem_free(ptr); return XFS_ERROR(EIO); } @@ -1970,6 +1971,13 @@ xlog_recover_do_inode_buffer( * magic number. If we don't recognise the magic number in the buffer, then * return a LSN of -1 so that the caller knows it was an unrecognised block and * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. */ static xfs_lsn_t xlog_recover_get_buf_lsn( @@ -1980,6 +1988,8 @@ xlog_recover_get_buf_lsn( __uint16_t magic16; __uint16_t magicda; void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; /* v4 filesystems always recover immediately */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -1992,43 +2002,79 @@ xlog_recover_get_buf_lsn( case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: - return be64_to_cpu( - ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn); + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: - return be64_to_cpu( - ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn); + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } case XFS_AGF_MAGIC: - return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; case XFS_AGFL_MAGIC: - return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; case XFS_AGI_MAGIC: - return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; case XFS_SYMLINK_MAGIC: - return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; case XFS_DIR3_BLOCK_MAGIC: case XFS_DIR3_DATA_MAGIC: case XFS_DIR3_FREE_MAGIC: - return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; case XFS_ATTR3_RMT_MAGIC: - return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; case XFS_SB_MAGIC: - return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn); + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; default: break; } + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); switch (magicda) { case XFS_DIR3_LEAF1_MAGIC: case XFS_DIR3_LEAFN_MAGIC: case XFS_DA3_NODE_MAGIC: - return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; default: break; } + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + /* * We do individual object checks on dquot and inode buffers as they * have their own individual LSN records. Also, we could have a stale @@ -2629,6 +2675,82 @@ out_release: return error; } +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + STATIC int xlog_recover_inode_pass2( struct xlog *log, @@ -2681,8 +2803,7 @@ xlog_recover_inode_pass2( error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - xfs_buf_relse(bp); - goto error; + goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2692,30 +2813,31 @@ xlog_recover_inode_pass2( * like an inode! */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } /* * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. */ if (dip->di_version >= 3) { xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); @@ -2723,7 +2845,7 @@ xlog_recover_inode_pass2( if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto out_release; + goto out_owner_change; } } @@ -2745,10 +2867,9 @@ xlog_recover_inode_pass2( dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { - xfs_buf_relse(bp); trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto error; + goto out_release; } } @@ -2760,13 +2881,12 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && @@ -2774,19 +2894,17 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", @@ -2794,29 +2912,27 @@ xlog_recover_inode_pass2( dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; - goto error; + goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; - goto error; + goto out_release; } isize = xfs_icdinode_size(dicp->di_version); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ @@ -2842,7 +2958,7 @@ xlog_recover_inode_pass2( } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2903,13 +3019,15 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); /* re-generate the checksum. */ xfs_dinode_calc_crc(log->l_mp, dip); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6218a0aeeeea..3e6c2e6c9cd2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -51,8 +51,9 @@ */ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it * currently is the only interface into the radix tree code that allows @@ -203,12 +204,9 @@ xfs_qm_dqpurge( * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. */ - mutex_lock(&qi->qi_lru_lock); ASSERT(!list_empty(&dqp->q_lru)); - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; + list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); - mutex_unlock(&qi->qi_lru_lock); xfs_qm_dqdestroy(dqp); @@ -680,6 +678,143 @@ xfs_qm_calc_dquots_per_chunk( return ndquots; } +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_del_init(&dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + unsigned long nr_to_scan = sc->nr_to_scan; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, + &nr_to_scan); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed; +} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_count_node(&qi->qi_lru, sc->nid); +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -696,11 +831,18 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + if ((error = list_lru_init(&qinf->qi_lru))) { + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; + } + /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ if ((error = xfs_qm_init_quotainos(mp))) { + list_lru_destroy(&qinf->qi_lru); kmem_free(qinf); mp->m_quotainfo = NULL; return error; @@ -711,10 +853,6 @@ xfs_qm_init_quotainfo( INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); mutex_init(&qinf->qi_tree_lock); - INIT_LIST_HEAD(&qinf->qi_lru_list); - qinf->qi_lru_count = 0; - mutex_init(&qinf->qi_lru_lock); - /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); @@ -779,8 +917,10 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } - qinf->qi_shrinker.shrink = xfs_qm_shake; + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; } @@ -801,6 +941,7 @@ xfs_qm_destroy_quotainfo( ASSERT(qi != NULL); unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1599,132 +1740,6 @@ xfs_qm_dqfree_one( xfs_qm_dqdestroy(dqp); } -STATIC void -xfs_qm_dqreclaim_one( - struct xfs_dquot *dqp, - struct list_head *buffer_list, - struct list_head *dispose_list) -{ - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - int error; - - if (!xfs_dqlock_nowait(dqp)) - goto out_move_tail; - - /* - * This dquot has acquired a reference in the meantime remove it from - * the freelist and try again. - */ - if (dqp->q_nrefs) { - xfs_dqunlock(dqp); - - trace_xfs_dqreclaim_want(dqp); - XFS_STATS_INC(xs_qm_dqwants); - - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - return; - } - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto out_unlock_move_tail; - - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - - trace_xfs_dqreclaim_dirty(dqp); - - error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - goto out_unlock_move_tail; - } - - xfs_buf_delwri_queue(bp, buffer_list); - xfs_buf_relse(bp); - /* - * Give the dquot another try on the freelist, as the - * flushing will take some time. - */ - goto out_unlock_move_tail; - } - xfs_dqfunlock(dqp); - - /* - * Prevent lookups now that we are past the point of no return. - */ - dqp->dq_flags |= XFS_DQ_FREEING; - xfs_dqunlock(dqp); - - ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, dispose_list); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - - trace_xfs_dqreclaim_done(dqp); - XFS_STATS_INC(xs_qm_dqreclaims); - return; - - /* - * Move the dquot to the tail of the list so that we don't spin on it. - */ -out_unlock_move_tail: - xfs_dqunlock(dqp); -out_move_tail: - list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); -} - -STATIC int -xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) -{ - struct xfs_quotainfo *qi = - container_of(shrink, struct xfs_quotainfo, qi_shrinker); - int nr_to_scan = sc->nr_to_scan; - LIST_HEAD (buffer_list); - LIST_HEAD (dispose_list); - struct xfs_dquot *dqp; - int error; - - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) - return 0; - if (!nr_to_scan) - goto out; - - mutex_lock(&qi->qi_lru_lock); - while (!list_empty(&qi->qi_lru_list)) { - if (nr_to_scan-- <= 0) - break; - dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, - q_lru); - xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); - } - mutex_unlock(&qi->qi_lru_lock); - - error = xfs_buf_delwri_submit(&buffer_list); - if (error) - xfs_warn(NULL, "%s: dquot reclaim failed", __func__); - - while (!list_empty(&dispose_list)) { - dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); - list_del_init(&dqp->q_lru); - xfs_qm_dqfree_one(dqp); - } - -out: - return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; -} - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 670cd4464070..2b602df9c242 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -49,9 +49,7 @@ typedef struct xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ - struct list_head qi_lru_list; - struct mutex qi_lru_lock; - int qi_lru_count; + struct list_lru qi_lru; int qi_dquots; time_t qi_btimelimit; /* limit for blks timer */ time_t qi_itimelimit; /* limit for inodes timer */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 979a77d4b87d..15188cc99449 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1535,19 +1535,21 @@ xfs_fs_mount( return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); } -static int +static long xfs_fs_nr_cached_objects( - struct super_block *sb) + struct super_block *sb, + int nid) { return xfs_reclaim_inodes_count(XFS_M(sb)); } -static void +static long xfs_fs_free_cached_objects( struct super_block *sb, - int nr_to_scan) + long nr_to_scan, + int nid) { - xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); } static const struct super_operations xfs_super_operations = { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 2f2a7c005be2..f622a97a7e33 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -41,6 +41,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_symlink.h" +#include "xfs_buf_item.h" /* ----- Kernel only functions below ----- */ STATIC int @@ -363,6 +364,7 @@ xfs_symlink( pathlen -= byte_cnt; offset += byte_cnt; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } |