1 files changed, 203 insertions, 138 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a3a4b5aaf5d9..c6108971b4e6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1,39 +1,26 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.	 Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-
 #include "xfs.h"
-#include "xfs_inum.h"
+#include "xfs_bit.h"
 #include "xfs_log.h"
+#include "xfs_inum.h"
 #include "xfs_sb.h"
+#include "xfs_ag.h"
 #include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
@@ -42,13 +29,13 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_iomap.h"
@@ -104,66 +91,114 @@ xfs_page_trace(
 #define xfs_page_trace(tag, inode, page, mask)
 #endif
 
-void
-linvfs_unwritten_done(
-	struct buffer_head	*bh,
-	int			uptodate)
+/*
+ * Schedule IO completion handling on an xfsdatad if this was
+ * the final hold on this ioend.
+ */
+STATIC void
+xfs_finish_ioend(
+	xfs_ioend_t		*ioend)
 {
-	xfs_buf_t		*pb = (xfs_buf_t *)bh->b_private;
+	if (atomic_dec_and_test(&ioend->io_remaining))
+		queue_work(xfsdatad_workqueue, &ioend->io_work);
+}
 
-	ASSERT(buffer_unwritten(bh));
-	bh->b_end_io = NULL;
-	clear_buffer_unwritten(bh);
-	if (!uptodate)
-		pagebuf_ioerror(pb, EIO);
-	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-		pagebuf_iodone(pb, 1, 1);
-	}
-	end_buffer_async_write(bh, uptodate);
+STATIC void
+xfs_destroy_ioend(
+	xfs_ioend_t		*ioend)
+{
+	vn_iowake(ioend->io_vnode);
+	mempool_free(ioend, xfs_ioend_pool);
 }
 
 /*
  * Issue transactions to convert a buffer range from unwritten
- * to written extents (buffered IO).
+ * to written extents.
  */
 STATIC void
-linvfs_unwritten_convert(
-	xfs_buf_t	*bp)
+xfs_end_bio_unwritten(
+	void			*data)
 {
-	vnode_t		*vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
-	int		error;
+	xfs_ioend_t		*ioend = data;
+	vnode_t			*vp = ioend->io_vnode;
+	xfs_off_t		offset = ioend->io_offset;
+	size_t			size = ioend->io_size;
+	struct buffer_head	*bh, *next;
+	int			error;
+
+	if (ioend->io_uptodate)
+		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+
+	/* ioend->io_buffer_head is only non-NULL for buffered I/O */
+	for (bh = ioend->io_buffer_head; bh; bh = next) {
+		next = bh->b_private;
 
-	BUG_ON(atomic_read(&bp->pb_hold) < 1);
-	VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
-			BMAPI_UNWRITTEN, NULL, NULL, error);
-	XFS_BUF_SET_FSPRIVATE(bp, NULL);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_UNDATAIO(bp);
-	iput(LINVFS_GET_IP(vp));
-	pagebuf_iodone(bp, 0, 0);
+		bh->b_end_io = NULL;
+		clear_buffer_unwritten(bh);
+		end_buffer_async_write(bh, ioend->io_uptodate);
+	}
+
+	xfs_destroy_ioend(ioend);
 }
 
 /*
- * Issue transactions to convert a buffer range from unwritten
- * to written extents (direct IO).
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
  */
-STATIC void
-linvfs_unwritten_convert_direct(
-	struct kiocb	*iocb,
-	loff_t		offset,
-	ssize_t		size,
-	void		*private)
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+	struct inode		*inode)
 {
-	struct inode	*inode = iocb->ki_filp->f_dentry->d_inode;
-	ASSERT(!private || inode == (struct inode *)private);
+	xfs_ioend_t		*ioend;
 
-	/* private indicates an unwritten extent lay beneath this IO */
-	if (private && size > 0) {
-		vnode_t	*vp = LINVFS_GET_VP(inode);
-		int	error;
+	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 
-		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
-	}
+	/*
+	 * Set the count to 1 initially, which will prevent an I/O
+	 * completion callback that happens before we have started
+	 * all the I/O from calling the completion routine too early.
+	 */
+	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_uptodate = 1; /* cleared if any I/O fails */
+	ioend->io_vnode = LINVFS_GET_VP(inode);
+	ioend->io_buffer_head = NULL;
+	atomic_inc(&ioend->io_vnode->v_iocount);
+	ioend->io_offset = 0;
+	ioend->io_size = 0;
+
+	INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
+
+	return ioend;
+}
+
+void
+linvfs_unwritten_done(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	xfs_ioend_t		*ioend = bh->b_private;
+	static spinlock_t	unwritten_done_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+
+	ASSERT(buffer_unwritten(bh));
+	bh->b_end_io = NULL;
+
+	if (!uptodate)
+		ioend->io_uptodate = 0;
+
+	/*
+	 * Deep magic here.  We reuse b_private in the buffer_heads to build
+	 * a chain for completing the I/O from user context after we've issued
+	 * a transaction to convert the unwritten extent.
+	 */
+	spin_lock_irqsave(&unwritten_done_lock, flags);
+	bh->b_private = ioend->io_buffer_head;
+	ioend->io_buffer_head = bh;
+	spin_unlock_irqrestore(&unwritten_done_lock, flags);
+
+	xfs_finish_ioend(ioend);
 }
 
 STATIC int
@@ -255,7 +290,7 @@ xfs_probe_unwritten_page(
 	struct address_space	*mapping,
 	pgoff_t			index,
 	xfs_iomap_t		*iomapp,
-	xfs_buf_t		*pb,
+	xfs_ioend_t		*ioend,
 	unsigned long		max_offset,
 	unsigned long		*fsbs,
 	unsigned int            bbits)
@@ -283,7 +318,7 @@ xfs_probe_unwritten_page(
 				break;
 			xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
 			set_buffer_unwritten_io(bh);
-			bh->b_private = pb;
+			bh->b_private = ioend;
 			p_offset += bh->b_size;
 			(*fsbs)++;
 		} while ((bh = bh->b_this_page) != head);
@@ -434,34 +469,15 @@ xfs_map_unwritten(
 {
 	struct buffer_head	*bh = curr;
 	xfs_iomap_t		*tmp;
-	xfs_buf_t		*pb;
-	loff_t			offset, size;
+	xfs_ioend_t		*ioend;
+	loff_t			offset;
 	unsigned long		nblocks = 0;
 
 	offset = start_page->index;
 	offset <<= PAGE_CACHE_SHIFT;
 	offset += p_offset;
 
-	/* get an "empty" pagebuf to manage IO completion
-	 * Proper values will be set before returning */
-	pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
-	if (!pb)
-		return -EAGAIN;
-
-	/* Take a reference to the inode to prevent it from
-	 * being reclaimed while we have outstanding unwritten
-	 * extent IO on it.
-	 */
-	if ((igrab(inode)) != inode) {
-		pagebuf_free(pb);
-		return -EAGAIN;
-	}
-
-	/* Set the count to 1 initially, this will stop an I/O
-	 * completion callout which happens before we have started
-	 * all the I/O from calling pagebuf_iodone too early.
-	 */
-	atomic_set(&pb->pb_io_remaining, 1);
+	ioend = xfs_alloc_ioend(inode);
 
 	/* First map forwards in the page consecutive buffers
 	 * covering this unwritten extent
@@ -474,12 +490,12 @@ xfs_map_unwritten(
 			break;
 		xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
 		set_buffer_unwritten_io(bh);
-		bh->b_private = pb;
+		bh->b_private = ioend;
 		p_offset += bh->b_size;
 		nblocks++;
 	} while ((bh = bh->b_this_page) != head);
 
-	atomic_add(nblocks, &pb->pb_io_remaining);
+	atomic_add(nblocks, &ioend->io_remaining);
 
 	/* If we reached the end of the page, map forwards in any
 	 * following pages which are also covered by this extent.
@@ -496,13 +512,13 @@ xfs_map_unwritten(
 		tloff = min(tlast, tloff);
 		for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
 			page = xfs_probe_unwritten_page(mapping,
-						tindex, iomapp, pb,
+						tindex, iomapp, ioend,
 						PAGE_CACHE_SIZE, &bs, bbits);
 			if (!page)
 				break;
 			nblocks += bs;
-			atomic_add(bs, &pb->pb_io_remaining);
-			xfs_convert_page(inode, page, iomapp, wbc, pb,
+			atomic_add(bs, &ioend->io_remaining);
+			xfs_convert_page(inode, page, iomapp, wbc, ioend,
 							startio, all_bh);
 			/* stop if converting the next page might add
 			 * enough blocks that the corresponding byte
@@ -514,12 +530,12 @@ xfs_map_unwritten(
 		if (tindex == tlast &&
 		    (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
 			page = xfs_probe_unwritten_page(mapping,
-							tindex, iomapp, pb,
+							tindex, iomapp, ioend,
 							pg_offset, &bs, bbits);
 			if (page) {
 				nblocks += bs;
-				atomic_add(bs, &pb->pb_io_remaining);
-				xfs_convert_page(inode, page, iomapp, wbc, pb,
+				atomic_add(bs, &ioend->io_remaining);
+				xfs_convert_page(inode, page, iomapp, wbc, ioend,
 							startio, all_bh);
 				if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
 					goto enough;
@@ -528,21 +544,9 @@ xfs_map_unwritten(
 	}
 
 enough:
-	size = nblocks;		/* NB: using 64bit number here */
-	size <<= block_bits;	/* convert fsb's to byte range */
-
-	XFS_BUF_DATAIO(pb);
-	XFS_BUF_ASYNC(pb);
-	XFS_BUF_SET_SIZE(pb, size);
-	XFS_BUF_SET_COUNT(pb, size);
-	XFS_BUF_SET_OFFSET(pb, offset);
-	XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
-	XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
-
-	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-		pagebuf_iodone(pb, 1, 1);
-	}
-
+	ioend->io_size = (xfs_off_t)nblocks << block_bits;
+	ioend->io_offset = offset;
+	xfs_finish_ioend(ioend);
 	return 0;
 }
 
@@ -744,8 +748,9 @@ xfs_page_state_convert(
 	if (page->index >= end_index) {
 		if ((page->index >= end_index + 1) ||
 		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
-			err = -EIO;
-			goto error;
+			if (startio)
+				unlock_page(page);
+			return 0;
 		}
 	}
 
@@ -787,7 +792,7 @@ xfs_page_state_convert(
 				continue;
 			if (!iomp) {
 				err = xfs_map_blocks(inode, offset, len, &iomap,
-						BMAPI_READ|BMAPI_IGNSTATE);
+						BMAPI_WRITE|BMAPI_IGNSTATE);
 				if (err) {
 					goto error;
 				}
@@ -931,15 +936,18 @@ __linvfs_get_block(
 {
 	vnode_t			*vp = LINVFS_GET_VP(inode);
 	xfs_iomap_t		iomap;
+	xfs_off_t		offset;
+	ssize_t			size;
 	int			retpbbm = 1;
 	int			error;
-	ssize_t			size;
-	loff_t			offset = (loff_t)iblock << inode->i_blkbits;
 
-	if (blocks)
-		size = blocks << inode->i_blkbits;
-	else
+	if (blocks) {
+		offset = blocks << inode->i_blkbits;	/* 64 bit goodness */
+		size = (ssize_t) min_t(xfs_off_t, offset, LONG_MAX);
+	} else {
 		size = 1 << inode->i_blkbits;
+	}
+	offset = (xfs_off_t)iblock << inode->i_blkbits;
 
 	VOP_BMAP(vp, offset, size,
 		create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
@@ -950,8 +958,8 @@ __linvfs_get_block(
 		return 0;
 
 	if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
-		xfs_daddr_t		bn;
-		loff_t			delta;
+		xfs_daddr_t	bn;
+		xfs_off_t	delta;
 
 		/* For unwritten extents do not report a disk address on
 		 * the read case (treat as if we're reading into a hole).
@@ -983,9 +991,8 @@ __linvfs_get_block(
 	 */
 	if (create &&
 	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
-	     (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) {
+	     (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW)))
 		set_buffer_new(bh_result);
-	}
 
 	if (iomap.iomap_flags & IOMAP_DELAY) {
 		BUG_ON(direct);
@@ -997,9 +1004,11 @@ __linvfs_get_block(
 	}
 
 	if (blocks) {
-		bh_result->b_size = (ssize_t)min(
-			(loff_t)(iomap.iomap_bsize - iomap.iomap_delta),
-			(loff_t)(blocks << inode->i_blkbits));
+		ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
+		offset = min_t(xfs_off_t,
+				iomap.iomap_bsize - iomap.iomap_delta,
+				blocks << inode->i_blkbits);
+		bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset);
 	}
 
 	return 0;
@@ -1028,6 +1037,44 @@ linvfs_get_blocks_direct(
 					create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
 
+STATIC void
+linvfs_end_io_direct(
+	struct kiocb	*iocb,
+	loff_t		offset,
+	ssize_t		size,
+	void		*private)
+{
+	xfs_ioend_t	*ioend = iocb->private;
+
+	/*
+	 * Non-NULL private data means we need to issue a transaction to
+	 * convert a range from unwritten to written extents.  This needs
+	 * to happen from process context but aio+dio I/O completion
+	 * happens from irq context so we need to defer it to a workqueue.
+	 * This is not necessary for synchronous direct I/O, but we do
+	 * it anyway to keep the code uniform and simpler.
+	 *
+	 * The core direct I/O code might be changed to always call the
+	 * completion handler in the future, in which case all this can
+	 * go away.
+	 */
+	if (private && size > 0) {
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+		xfs_finish_ioend(ioend);
+	} else {
+		ASSERT(size >= 0);
+		xfs_destroy_ioend(ioend);
+	}
+
+	/*
+	 * blockdev_direct_IO can return an error even after the I/O
+	 * completion handler was called.  Thus we need to protect
+	 * against double-freeing.
+	 */
+	iocb->private = NULL;
+}
+
 STATIC ssize_t
 linvfs_direct_IO(
 	int			rw,
@@ -1042,16 +1089,23 @@ linvfs_direct_IO(
 	xfs_iomap_t	iomap;
 	int		maps = 1;
 	int		error;
+	ssize_t		ret;
 
 	VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
 	if (error)
 		return -error;
 
-	return blockdev_direct_IO_own_locking(rw, iocb, inode,
+	iocb->private = xfs_alloc_ioend(inode);
+
+	ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
 		iomap.iomap_target->pbr_bdev,
 		iov, offset, nr_segs,
 		linvfs_get_blocks_direct,
-		linvfs_unwritten_convert_direct);
+		linvfs_end_io_direct);
+
+	if (unlikely(ret <= 0 && iocb->private))
+		xfs_destroy_ioend(iocb->private);
+	return ret;
 }
 
 
@@ -1202,6 +1256,16 @@ out_unlock:
 	return error;
 }
 
+STATIC int
+linvfs_invalidate_page(
+	struct page		*page,
+	unsigned long		offset)
+{
+	xfs_page_trace(XFS_INVALIDPAGE_ENTER,
+			page->mapping->host, page, offset);
+	return block_invalidatepage(page, offset);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. Possibly the page is already clean. We always
@@ -1224,7 +1288,7 @@ out_unlock:
 STATIC int
 linvfs_release_page(
 	struct page		*page,
-	int			gfp_mask)
+	gfp_t			gfp_mask)
 {
 	struct inode		*inode = page->mapping->host;
 	int			dirty, delalloc, unmapped, unwritten;
@@ -1279,6 +1343,7 @@ struct address_space_operations linvfs_aops = {
 	.writepage		= linvfs_writepage,
 	.sync_page		= block_sync_page,
 	.releasepage		= linvfs_release_page,
+	.invalidatepage		= linvfs_invalidate_page,
 	.prepare_write		= linvfs_prepare_write,
 	.commit_write		= generic_commit_write,
 	.bmap			= linvfs_bmap,