diff options
Diffstat (limited to 'fs/xfs')
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 13 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 8 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.h | 1 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_ioctl.c | 7 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_iops.c | 2 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_super.c | 9 | ||||
| -rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 42 | ||||
| -rw-r--r-- | fs/xfs/xfs_bmap.c | 14 | ||||
| -rw-r--r-- | fs/xfs/xfs_fs.h | 4 | ||||
| -rw-r--r-- | fs/xfs/xfs_fsops.c | 31 | ||||
| -rw-r--r-- | fs/xfs/xfs_fsops.h | 2 | ||||
| -rw-r--r-- | fs/xfs/xfs_ialloc.c | 16 | ||||
| -rw-r--r-- | fs/xfs/xfs_inode.c | 49 | ||||
| -rw-r--r-- | fs/xfs/xfs_log.c | 7 | ||||
| -rw-r--r-- | fs/xfs/xfs_log_cil.c | 263 | ||||
| -rw-r--r-- | fs/xfs/xfs_log_priv.h | 13 | ||||
| -rw-r--r-- | fs/xfs/xfs_trans.c | 5 | ||||
| -rw-r--r-- | fs/xfs/xfs_trans_priv.h | 3 | ||||
| -rw-r--r-- | fs/xfs/xfs_vnodeops.c | 13 | 
19 files changed, 286 insertions, 216 deletions
| diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 15412fe15c3a..b552f816de15 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -852,8 +852,8 @@ xfs_convert_page(  		SetPageUptodate(page);  	if (count) { -		wbc->nr_to_write--; -		if (wbc->nr_to_write <= 0) +		if (--wbc->nr_to_write <= 0 && +		    wbc->sync_mode == WB_SYNC_NONE)  			done = 1;  	}  	xfs_start_page_writeback(page, !page_dirty, count); @@ -1068,7 +1068,7 @@ xfs_vm_writepage(  	 * by themselves.  	 */  	if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) -		goto out_fail; +		goto redirty;  	/*  	 * We need a transaction if there are delalloc or unwritten buffers @@ -1080,7 +1080,7 @@ xfs_vm_writepage(  	 */  	xfs_count_page_state(page, &delalloc, &unwritten);  	if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) -		goto out_fail; +		goto redirty;  	/* Is this page beyond the end of the file? */  	offset = i_size_read(inode); @@ -1245,12 +1245,15 @@ error:  	if (iohead)  		xfs_cancel_ioend(iohead); +	if (err == -EAGAIN) +		goto redirty; +  	xfs_aops_discard_page(page);  	ClearPageUptodate(page);  	unlock_page(page);  	return err; -out_fail: +redirty:  	redirty_page_for_writepage(wbc, page);  	unlock_page(page);  	return 0; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ea79072f5210..d72cf2bb054a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -440,12 +440,7 @@ _xfs_buf_find(  		ASSERT(btp == bp->b_target);  		if (bp->b_file_offset == range_base &&  		    bp->b_buffer_length == range_length) { -			/* -			 * If we look at something, bring it to the -			 * front of the list for next time. -			 */  			atomic_inc(&bp->b_hold); -			list_move(&bp->b_hash_list, &hash->bh_list);  			goto found;  		}  	} @@ -1443,8 +1438,7 @@ xfs_alloc_bufhash(  {  	unsigned int		i; -	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */ -	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; +	btp->bt_hashshift = external ? 3 : 12;	/* 8 or 4096 buckets */  	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *  					 sizeof(xfs_bufhash_t));  	for (i = 0; i < (1 << btp->bt_hashshift); i++) { diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index d072e5ff923b..2a05614f0b92 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -137,7 +137,6 @@ typedef struct xfs_buftarg {  	size_t			bt_smask;  	/* per device buffer hash table */ -	uint			bt_hashmask;  	uint			bt_hashshift;  	xfs_bufhash_t		*bt_hash; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 237f5ffb2ee8..4fec427b83ef 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -907,6 +907,13 @@ xfs_ioctl_setattr(  		return XFS_ERROR(EIO);  	/* +	 * Disallow 32bit project ids because on-disk structure +	 * is 16bit only. +	 */ +	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) +		return XFS_ERROR(EINVAL); + +	/*  	 * If disk quotas is on, we make sure that the dquots do exist on disk,  	 * before we start any other transactions. Trying to do this later  	 * is messy. We don't care to take a readlock to look at the ids diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 68be25dcd301..b1fc2a6bfe83 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -664,7 +664,7 @@ xfs_vn_fiemap(  					fieinfo->fi_extents_max + 1;  	bm.bmv_count = min_t(__s32, bm.bmv_count,  			     (PAGE_SIZE * 16 / sizeof(struct getbmapx))); -	bm.bmv_iflags = BMV_IF_PREALLOC; +	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;  	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)  		bm.bmv_iflags |= BMV_IF_ATTRFORK;  	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 15c35b62ff14..a4e07974955b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1226,6 +1226,7 @@ xfs_fs_statfs(  	struct xfs_inode	*ip = XFS_I(dentry->d_inode);  	__uint64_t		fakeinos, id;  	xfs_extlen_t		lsize; +	__int64_t		ffree;  	statp->f_type = XFS_SB_MAGIC;  	statp->f_namelen = MAXNAMELEN - 1; @@ -1249,7 +1250,11 @@ xfs_fs_statfs(  		statp->f_files = min_t(typeof(statp->f_files),  					statp->f_files,  					mp->m_maxicount); -	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + +	/* make sure statp->f_ffree does not underflow */ +	ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); +	statp->f_ffree = max_t(__int64_t, ffree, 0); +  	spin_unlock(&mp->m_sb_lock);  	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || @@ -1402,7 +1407,7 @@ xfs_fs_freeze(  	xfs_save_resvblks(mp);  	xfs_quiesce_attr(mp); -	return -xfs_fs_log_dummy(mp); +	return -xfs_fs_log_dummy(mp, SYNC_WAIT);  }  STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index dfcbd98d1599..d59c4a65d492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -34,6 +34,7 @@  #include "xfs_inode_item.h"  #include "xfs_quota.h"  #include "xfs_trace.h" +#include "xfs_fsops.h"  #include <linux/kthread.h>  #include <linux/freezer.h> @@ -341,38 +342,6 @@ xfs_sync_attr(  }  STATIC int -xfs_commit_dummy_trans( -	struct xfs_mount	*mp, -	uint			flags) -{ -	struct xfs_inode	*ip = mp->m_rootip; -	struct xfs_trans	*tp; -	int			error; - -	/* -	 * Put a dummy transaction in the log to tell recovery -	 * that all others are OK. -	 */ -	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); -	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); -	if (error) { -		xfs_trans_cancel(tp, 0); -		return error; -	} - -	xfs_ilock(ip, XFS_ILOCK_EXCL); - -	xfs_trans_ijoin(tp, ip); -	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -	error = xfs_trans_commit(tp, 0); -	xfs_iunlock(ip, XFS_ILOCK_EXCL); - -	/* the log force ensures this transaction is pushed to disk */ -	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); -	return error; -} - -STATIC int  xfs_sync_fsdata(  	struct xfs_mount	*mp)  { @@ -432,7 +401,7 @@ xfs_quiesce_data(  	/* mark the log as covered if needed */  	if (xfs_log_need_covered(mp)) -		error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); +		error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);  	/* flush data-only devices */  	if (mp->m_rtdev_targp) @@ -563,7 +532,7 @@ xfs_flush_inodes(  /*   * Every sync period we need to unpin all items, reclaim inodes and sync   * disk quotas.  We might need to cover the log to indicate that the - * filesystem is idle. + * filesystem is idle and not frozen.   */  STATIC void  xfs_sync_worker( @@ -577,8 +546,9 @@ xfs_sync_worker(  		xfs_reclaim_inodes(mp, 0);  		/* dgc: errors ignored here */  		error = xfs_qm_sync(mp, SYNC_TRYLOCK); -		if (xfs_log_need_covered(mp)) -			error = xfs_commit_dummy_trans(mp, 0); +		if (mp->m_super->s_frozen == SB_UNFROZEN && +		    xfs_log_need_covered(mp)) +			error = xfs_fs_log_dummy(mp, 0);  	}  	mp->m_sync_seq++;  	wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 23f14e595c18..f90dadd5a968 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5533,12 +5533,24 @@ xfs_getbmap(  					map[i].br_startblock))  				goto out_free_map; -			nexleft--;  			bmv->bmv_offset =  				out[cur_ext].bmv_offset +  				out[cur_ext].bmv_length;  			bmv->bmv_length =  				max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + +			/* +			 * In case we don't want to return the hole, +			 * don't increase cur_ext so that we can reuse +			 * it in the next loop. +			 */ +			if ((iflags & BMV_IF_NO_HOLES) && +			    map[i].br_startblock == HOLESTARTBLOCK) { +				memset(&out[cur_ext], 0, sizeof(out[cur_ext])); +				continue; +			} + +			nexleft--;  			bmv->bmv_entries++;  			cur_ext++;  		} diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 7cf7220e7d5f..87c2e9d02288 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -114,8 +114,10 @@ struct getbmapx {  #define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event  */  #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */  #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */  #define BMV_IF_VALID	\ -	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) +	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\ +	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)  /*	bmv_oflags values - returned for each non-header segment */  #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index dbca5f5c37ba..43b1d5699335 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -604,31 +604,36 @@ out:  	return 0;  } +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead. + */  int  xfs_fs_log_dummy( -	xfs_mount_t	*mp) +	xfs_mount_t	*mp, +	int		flags)  {  	xfs_trans_t	*tp; -	xfs_inode_t	*ip;  	int		error;  	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); -	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); +	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, +					XFS_DEFAULT_LOG_COUNT);  	if (error) {  		xfs_trans_cancel(tp, 0);  		return error;  	} -	ip = mp->m_rootip; -	xfs_ilock(ip, XFS_ILOCK_EXCL); - -	xfs_trans_ijoin(tp, ip); -	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -	xfs_trans_set_sync(tp); -	error = xfs_trans_commit(tp, 0); - -	xfs_iunlock(ip, XFS_ILOCK_EXCL); -	return error; +	/* log the UUID because it is an unchanging field */ +	xfs_mod_sb(tp, XFS_SB_UUID); +	if (flags & SYNC_WAIT) +		xfs_trans_set_sync(tp); +	return xfs_trans_commit(tp, 0);  }  int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 88435e0a77c9..a786c5212c1e 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);  extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,  				xfs_fsop_resblks_t *outval);  extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);  #endif	/* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index abf80ae1e95b..5371d2dc360e 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1213,7 +1213,6 @@ xfs_imap_lookup(  	struct xfs_inobt_rec_incore rec;  	struct xfs_btree_cur	*cur;  	struct xfs_buf		*agbp; -	xfs_agino_t		startino;  	int			error;  	int			i; @@ -1227,13 +1226,13 @@ xfs_imap_lookup(  	}  	/* -	 * derive and lookup the exact inode record for the given agino. If the -	 * record cannot be found, then it's an invalid inode number and we -	 * should abort. +	 * Lookup the inode record for the given agino. If the record cannot be +	 * found, then it's an invalid inode number and we should abort. Once +	 * we have a record, we need to ensure it contains the inode number +	 * we are looking up.  	 */  	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); -	startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); -	error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); +	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);  	if (!error) {  		if (i)  			error = xfs_inobt_get_rec(cur, &rec, &i); @@ -1246,6 +1245,11 @@ xfs_imap_lookup(  	if (error)  		return error; +	/* check that the returned record contains the required inode */ +	if (rec.ir_startino > agino || +	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) +		return EINVAL; +  	/* for untrusted inodes check it is allocated first */  	if ((flags & XFS_IGET_UNTRUSTED) &&  	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 68415cb4f23c..34798f391c49 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1914,6 +1914,11 @@ xfs_iunlink_remove(  	return 0;  } +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */  STATIC void  xfs_ifree_cluster(  	xfs_inode_t	*free_ip, @@ -1945,8 +1950,6 @@ xfs_ifree_cluster(  	}  	for (j = 0; j < nbufs; j++, inum += ninodes) { -		int	found = 0; -  		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),  					 XFS_INO_TO_AGBNO(mp, inum)); @@ -1965,7 +1968,9 @@ xfs_ifree_cluster(  		/*  		 * Walk the inodes already attached to the buffer and mark them  		 * stale. These will all have the flush locks held, so an -		 * in-memory inode walk can't lock them. +		 * in-memory inode walk can't lock them. By marking them all +		 * stale first, we will not attempt to lock them in the loop +		 * below as the XFS_ISTALE flag will be set.  		 */  		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);  		while (lip) { @@ -1977,11 +1982,11 @@ xfs_ifree_cluster(  							&iip->ili_flush_lsn,  							&iip->ili_item.li_lsn);  				xfs_iflags_set(iip->ili_inode, XFS_ISTALE); -				found++;  			}  			lip = lip->li_bio_list;  		} +  		/*  		 * For each inode in memory attempt to add it to the inode  		 * buffer and set it up for being staled on buffer IO @@ -1993,6 +1998,7 @@ xfs_ifree_cluster(  		 * even trying to lock them.  		 */  		for (i = 0; i < ninodes; i++) { +retry:  			read_lock(&pag->pag_ici_lock);  			ip = radix_tree_lookup(&pag->pag_ici_root,  					XFS_INO_TO_AGINO(mp, (inum + i))); @@ -2003,38 +2009,36 @@ xfs_ifree_cluster(  				continue;  			} -			/* don't try to lock/unlock the current inode */ +			/* +			 * Don't try to lock/unlock the current inode, but we +			 * _cannot_ skip the other inodes that we did not find +			 * in the list attached to the buffer and are not +			 * already marked stale. If we can't lock it, back off +			 * and retry. +			 */  			if (ip != free_ip &&  			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {  				read_unlock(&pag->pag_ici_lock); -				continue; +				delay(1); +				goto retry;  			}  			read_unlock(&pag->pag_ici_lock); -			if (!xfs_iflock_nowait(ip)) { -				if (ip != free_ip) -					xfs_iunlock(ip, XFS_ILOCK_EXCL); -				continue; -			} - +			xfs_iflock(ip);  			xfs_iflags_set(ip, XFS_ISTALE); -			if (xfs_inode_clean(ip)) { -				ASSERT(ip != free_ip); -				xfs_ifunlock(ip); -				xfs_iunlock(ip, XFS_ILOCK_EXCL); -				continue; -			} +			/* +			 * we don't need to attach clean inodes or those only +			 * with unlogged changes (which we throw away, anyway). +			 */  			iip = ip->i_itemp; -			if (!iip) { -				/* inode with unlogged changes only */ +			if (!iip || xfs_inode_clean(ip)) {  				ASSERT(ip != free_ip);  				ip->i_update_core = 0;  				xfs_ifunlock(ip);  				xfs_iunlock(ip, XFS_ILOCK_EXCL);  				continue;  			} -			found++;  			iip->ili_last_fields = iip->ili_format.ilf_fields;  			iip->ili_format.ilf_fields = 0; @@ -2049,8 +2053,7 @@ xfs_ifree_cluster(  				xfs_iunlock(ip, XFS_ILOCK_EXCL);  		} -		if (found) -			xfs_trans_stale_inode_buf(tp, bp); +		xfs_trans_stale_inode_buf(tp, bp);  		xfs_trans_binval(tp, bp);  	} diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 925d572bf0f4..33f718f92a48 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3015,7 +3015,8 @@ _xfs_log_force(  	XFS_STATS_INC(xs_log_force); -	xlog_cil_push(log, 1); +	if (log->l_cilp) +		xlog_cil_force(log);  	spin_lock(&log->l_icloglock); @@ -3167,7 +3168,7 @@ _xfs_log_force_lsn(  	XFS_STATS_INC(xs_log_force);  	if (log->l_cilp) { -		lsn = xlog_cil_push_lsn(log, lsn); +		lsn = xlog_cil_force_lsn(log, lsn);  		if (lsn == NULLCOMMITLSN)  			return 0;  	} @@ -3724,7 +3725,7 @@ xfs_log_force_umount(  	 * call below.  	 */  	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) -		xlog_cil_push(log, 1); +		xlog_cil_force(log);  	/*  	 * We must hold both the GRANT lock and the LOG lock, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 31e4ea2d19ac..ed575fb4b495 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -68,6 +68,7 @@ xlog_cil_init(  	ctx->sequence = 1;  	ctx->cil = cil;  	cil->xc_ctx = ctx; +	cil->xc_current_sequence = ctx->sequence;  	cil->xc_log = log;  	log->l_cilp = cil; @@ -269,15 +270,10 @@ xlog_cil_insert(  static void  xlog_cil_format_items(  	struct log		*log, -	struct xfs_log_vec	*log_vector, -	struct xlog_ticket	*ticket, -	xfs_lsn_t		*start_lsn) +	struct xfs_log_vec	*log_vector)  {  	struct xfs_log_vec *lv; -	if (start_lsn) -		*start_lsn = log->l_cilp->xc_ctx->sequence; -  	ASSERT(log_vector);  	for (lv = log_vector; lv; lv = lv->lv_next) {  		void	*ptr; @@ -301,9 +297,24 @@ xlog_cil_format_items(  			ptr += vec->i_len;  		}  		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); +	} +} + +static void +xlog_cil_insert_items( +	struct log		*log, +	struct xfs_log_vec	*log_vector, +	struct xlog_ticket	*ticket, +	xfs_lsn_t		*start_lsn) +{ +	struct xfs_log_vec *lv; + +	if (start_lsn) +		*start_lsn = log->l_cilp->xc_ctx->sequence; +	ASSERT(log_vector); +	for (lv = log_vector; lv; lv = lv->lv_next)  		xlog_cil_insert(log, ticket, lv->lv_item, lv); -	}  }  static void @@ -321,80 +332,6 @@ xlog_cil_free_logvec(  }  /* - * Commit a transaction with the given vector to the Committed Item List. - * - * To do this, we need to format the item, pin it in memory if required and - * account for the space used by the transaction. Once we have done that we - * need to release the unused reservation for the transaction, attach the - * transaction to the checkpoint context so we carry the busy extents through - * to checkpoint completion, and then unlock all the items in the transaction. - * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * - * Called with the context lock already held in read mode to lock out - * background commit, returns without it held once background commits are - * allowed again. - */ -int -xfs_log_commit_cil( -	struct xfs_mount	*mp, -	struct xfs_trans	*tp, -	struct xfs_log_vec	*log_vector, -	xfs_lsn_t		*commit_lsn, -	int			flags) -{ -	struct log		*log = mp->m_log; -	int			log_flags = 0; -	int			push = 0; - -	if (flags & XFS_TRANS_RELEASE_LOG_RES) -		log_flags = XFS_LOG_REL_PERM_RESERV; - -	if (XLOG_FORCED_SHUTDOWN(log)) { -		xlog_cil_free_logvec(log_vector); -		return XFS_ERROR(EIO); -	} - -	/* lock out background commit */ -	down_read(&log->l_cilp->xc_ctx_lock); -	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); - -	/* check we didn't blow the reservation */ -	if (tp->t_ticket->t_curr_res < 0) -		xlog_print_tic_res(log->l_mp, tp->t_ticket); - -	/* attach the transaction to the CIL if it has any busy extents */ -	if (!list_empty(&tp->t_busy)) { -		spin_lock(&log->l_cilp->xc_cil_lock); -		list_splice_init(&tp->t_busy, -					&log->l_cilp->xc_ctx->busy_extents); -		spin_unlock(&log->l_cilp->xc_cil_lock); -	} - -	tp->t_commit_lsn = *commit_lsn; -	xfs_log_done(mp, tp->t_ticket, NULL, log_flags); -	xfs_trans_unreserve_and_mod_sb(tp); - -	/* check for background commit before unlock */ -	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) -		push = 1; -	up_read(&log->l_cilp->xc_ctx_lock); - -	/* -	 * We need to push CIL every so often so we don't cache more than we -	 * can fit in the log. The limit really is that a checkpoint can't be -	 * more than half the log (the current checkpoint is not allowed to -	 * overwrite the previous checkpoint), but commit latency and memory -	 * usage limit this to a smaller size in most cases. -	 */ -	if (push) -		xlog_cil_push(log, 0); -	return 0; -} - -/*   * Mark all items committed and clear busy extents. We free the log vector   * chains in a separate pass so that we unpin the log items as quickly as   * possible. @@ -427,13 +364,23 @@ xlog_cil_committed(  }  /* - * Push the Committed Item List to the log. If the push_now flag is not set, - * then it is a background flush and so we can chose to ignore it. + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes.   */ -int +STATIC int  xlog_cil_push(  	struct log		*log, -	int			push_now) +	xfs_lsn_t		push_seq)  {  	struct xfs_cil		*cil = log->l_cilp;  	struct xfs_log_vec	*lv; @@ -453,12 +400,14 @@ xlog_cil_push(  	if (!cil)  		return 0; +	ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); +  	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);  	new_ctx->ticket = xlog_cil_ticket_alloc(log);  	/* lock out transaction commit, but don't block on background push */  	if (!down_write_trylock(&cil->xc_ctx_lock)) { -		if (!push_now) +		if (!push_seq)  			goto out_free_ticket;  		down_write(&cil->xc_ctx_lock);  	} @@ -469,7 +418,11 @@ xlog_cil_push(  		goto out_skip;  	/* check for spurious background flush */ -	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) +	if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) +		goto out_skip; + +	/* check for a previously pushed seqeunce */ +	if (push_seq < cil->xc_ctx->sequence)  		goto out_skip;  	/* @@ -515,6 +468,13 @@ xlog_cil_push(  	cil->xc_ctx = new_ctx;  	/* +	 * mirror the new sequence into the cil structure so that we can do +	 * unlocked checks against the current sequence in log forces without +	 * risking deferencing a freed context pointer. +	 */ +	cil->xc_current_sequence = new_ctx->sequence; + +	/*  	 * The switch is now done, so we can drop the context lock and move out  	 * of a shared context. We can't just go straight to the commit record,  	 * though - we need to synchronise with previous and future commits so @@ -626,6 +586,102 @@ out_abort:  }  /* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( +	struct xfs_mount	*mp, +	struct xfs_trans	*tp, +	struct xfs_log_vec	*log_vector, +	xfs_lsn_t		*commit_lsn, +	int			flags) +{ +	struct log		*log = mp->m_log; +	int			log_flags = 0; +	int			push = 0; + +	if (flags & XFS_TRANS_RELEASE_LOG_RES) +		log_flags = XFS_LOG_REL_PERM_RESERV; + +	if (XLOG_FORCED_SHUTDOWN(log)) { +		xlog_cil_free_logvec(log_vector); +		return XFS_ERROR(EIO); +	} + +	/* +	 * do all the hard work of formatting items (including memory +	 * allocation) outside the CIL context lock. This prevents stalling CIL +	 * pushes when we are low on memory and a transaction commit spends a +	 * lot of time in memory reclaim. +	 */ +	xlog_cil_format_items(log, log_vector); + +	/* lock out background commit */ +	down_read(&log->l_cilp->xc_ctx_lock); +	xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); + +	/* check we didn't blow the reservation */ +	if (tp->t_ticket->t_curr_res < 0) +		xlog_print_tic_res(log->l_mp, tp->t_ticket); + +	/* attach the transaction to the CIL if it has any busy extents */ +	if (!list_empty(&tp->t_busy)) { +		spin_lock(&log->l_cilp->xc_cil_lock); +		list_splice_init(&tp->t_busy, +					&log->l_cilp->xc_ctx->busy_extents); +		spin_unlock(&log->l_cilp->xc_cil_lock); +	} + +	tp->t_commit_lsn = *commit_lsn; +	xfs_log_done(mp, tp->t_ticket, NULL, log_flags); +	xfs_trans_unreserve_and_mod_sb(tp); + +	/* +	 * Once all the items of the transaction have been copied to the CIL, +	 * the items can be unlocked and freed. +	 * +	 * This needs to be done before we drop the CIL context lock because we +	 * have to update state in the log items and unlock them before they go +	 * to disk. If we don't, then the CIL checkpoint can race with us and +	 * we can run checkpoint completion before we've updated and unlocked +	 * the log items. This affects (at least) processing of stale buffers, +	 * inodes and EFIs. +	 */ +	xfs_trans_free_items(tp, *commit_lsn, 0); + +	/* check for background commit before unlock */ +	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) +		push = 1; + +	up_read(&log->l_cilp->xc_ctx_lock); + +	/* +	 * We need to push CIL every so often so we don't cache more than we +	 * can fit in the log. The limit really is that a checkpoint can't be +	 * more than half the log (the current checkpoint is not allowed to +	 * overwrite the previous checkpoint), but commit latency and memory +	 * usage limit this to a smaller size in most cases. +	 */ +	if (push) +		xlog_cil_push(log, 0); +	return 0; +} + +/*   * Conditionally push the CIL based on the sequence passed in.   *   * We only need to push if we haven't already pushed the sequence @@ -639,39 +695,34 @@ out_abort:   * commit lsn is there. It'll be empty, so this is broken for now.   */  xfs_lsn_t -xlog_cil_push_lsn( +xlog_cil_force_lsn(  	struct log	*log, -	xfs_lsn_t	push_seq) +	xfs_lsn_t	sequence)  {  	struct xfs_cil		*cil = log->l_cilp;  	struct xfs_cil_ctx	*ctx;  	xfs_lsn_t		commit_lsn = NULLCOMMITLSN; -restart: -	down_write(&cil->xc_ctx_lock); -	ASSERT(push_seq <= cil->xc_ctx->sequence); - -	/* check to see if we need to force out the current context */ -	if (push_seq == cil->xc_ctx->sequence) { -		up_write(&cil->xc_ctx_lock); -		xlog_cil_push(log, 1); -		goto restart; -	} +	ASSERT(sequence <= cil->xc_current_sequence); + +	/* +	 * check to see if we need to force out the current context. +	 * xlog_cil_push() handles racing pushes for the same sequence, +	 * so no need to deal with it here. +	 */ +	if (sequence == cil->xc_current_sequence) +		xlog_cil_push(log, sequence);  	/*  	 * See if we can find a previous sequence still committing. -	 * We can drop the flush lock as soon as we have the cil lock -	 * because we are now only comparing contexts protected by -	 * the cil lock. -	 *  	 * We need to wait for all previous sequence commits to complete  	 * before allowing the force of push_seq to go ahead. Hence block  	 * on commits for those as well.  	 */ +restart:  	spin_lock(&cil->xc_cil_lock); -	up_write(&cil->xc_ctx_lock);  	list_for_each_entry(ctx, &cil->xc_committing, committing) { -		if (ctx->sequence > push_seq) +		if (ctx->sequence > sequence)  			continue;  		if (!ctx->commit_lsn) {  			/* @@ -681,7 +732,7 @@ restart:  			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);  			goto restart;  		} -		if (ctx->sequence != push_seq) +		if (ctx->sequence != sequence)  			continue;  		/* found it! */  		commit_lsn = ctx->commit_lsn; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8c072618965c..ced52b98b322 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -422,6 +422,7 @@ struct xfs_cil {  	struct rw_semaphore	xc_ctx_lock;  	struct list_head	xc_committing;  	sv_t			xc_commit_wait; +	xfs_lsn_t		xc_current_sequence;  };  /* @@ -562,8 +563,16 @@ int	xlog_cil_init(struct log *log);  void	xlog_cil_init_post_recovery(struct log *log);  void	xlog_cil_destroy(struct log *log); -int	xlog_cil_push(struct log *log, int push_now); -xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); +/* + * CIL force routines + */ +xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct log *log) +{ +	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +}  /*   * Unmount record type is used as a pseudo transaction type for the ticket. diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdca7416c754..1c47edaea0d2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1167,7 +1167,7 @@ xfs_trans_del_item(   * Unlock all of the items of a transaction and free all the descriptors   * of that transaction.   */ -STATIC void +void  xfs_trans_free_items(  	struct xfs_trans	*tp,  	xfs_lsn_t		commit_lsn, @@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(  		return error;  	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - -	/* xfs_trans_free_items() unlocks them first */ -	xfs_trans_free_items(tp, *commit_lsn, 0);  	xfs_trans_free(tp);  	return 0;  } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index e2d93d8ead7b..62da86c90de5 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -25,7 +25,8 @@ struct xfs_trans;  void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);  void	xfs_trans_del_item(struct xfs_log_item *); - +void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, +				int flags);  void	xfs_trans_item_committed(struct xfs_log_item *lip,  				xfs_lsn_t commit_lsn, int aborted);  void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 66d585c6917c..4c7c7bfb2b2f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2299,15 +2299,22 @@ xfs_alloc_file_space(  			e = allocatesize_fsb;  		} +		/* +		 * The transaction reservation is limited to a 32-bit block +		 * count, hence we need to limit the number of blocks we are +		 * trying to reserve to avoid an overflow. We can't allocate +		 * more than @nimaps extents, and an extent is limited on disk +		 * to MAXEXTLEN (21 bits), so use that to enforce the limit. +		 */ +		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));  		if (unlikely(rt)) { -			resrtextents = qblocks = (uint)(e - s); +			resrtextents = qblocks = resblks;  			resrtextents /= mp->m_sb.sb_rextsize;  			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);  			quota_flag = XFS_QMOPT_RES_RTBLKS;  		} else {  			resrtextents = 0; -			resblks = qblocks = \ -				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); +			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);  			quota_flag = XFS_QMOPT_RES_REGBLKS;  		} | 
