aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_reflink.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_reflink.c')
-rw-r--r--fs/xfs/xfs_reflink.c839
1 files changed, 504 insertions, 335 deletions
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b0ce04ffd3cd..93bdd25680bc 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -27,7 +27,7 @@
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
-#include "xfs_sb.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
@@ -125,11 +125,10 @@
* shared blocks. If there are no shared extents, fbno and flen will
* be set to NULLAGBLOCK and 0, respectively.
*/
-int
+static int
xfs_reflink_find_shared(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
xfs_agblock_t *fbno,
@@ -140,11 +139,11 @@ xfs_reflink_find_shared(
struct xfs_btree_cur *cur;
int error;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
@@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared(
struct xfs_bmbt_irec *irec,
bool *shared)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
xfs_agblock_t agbno;
xfs_extlen_t aglen;
xfs_agblock_t fbno;
@@ -179,19 +179,20 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
trace_xfs_reflink_trim_around_shared(ip, irec);
- agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
- agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
aglen = irec->br_blockcount;
- error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
- aglen, &fbno, &flen, true);
+ error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
+ true);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -199,7 +200,9 @@ xfs_reflink_trim_around_shared(
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
- } else if (fbno == agbno) {
+ }
+
+ if (fbno == agbno) {
/*
* The start of this extent is shared. Truncate the
* mapping at the end of the shared region so that a
@@ -209,16 +212,16 @@ xfs_reflink_trim_around_shared(
irec->br_blockcount = flen;
*shared = true;
return 0;
- } else {
- /*
- * There's a shared extent midway through this extent.
- * Truncate the mapping at the start of the shared
- * extent so that a subsequent iteration starts at the
- * start of the shared region.
- */
- irec->br_blockcount = fbno - agbno;
- return 0;
}
+
+ /*
+ * There's a shared extent midway through this extent.
+ * Truncate the mapping at the start of the shared
+ * extent so that a subsequent iteration starts at the
+ * start of the shared region.
+ */
+ irec->br_blockcount = fbno - agbno;
+ return 0;
}
int
@@ -340,9 +343,41 @@ xfs_find_trim_cow_extent(
return 0;
}
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+ * cmap might larger than imap due to cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
@@ -351,59 +386,37 @@ xfs_reflink_allocate_cow(
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = imap->br_startoff;
- xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
- int nimaps, error = 0;
- bool found;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
-
- error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
- return error;
- if (found)
- goto convert;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
xfs_iunlock(ip, *lockmode);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- *lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, *lockmode);
+ *lockmode = 0;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
if (error)
return error;
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_trans_cancel;
+ *lockmode = XFS_ILOCK_EXCL;
- /*
- * Check for an overlapping extent again now that we dropped the ilock.
- */
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
+
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ijoin(tp, ip, 0);
+ ASSERT(cmap->br_startoff > imap->br_startoff);
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
@@ -411,7 +424,7 @@ xfs_reflink_allocate_cow(
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
&nimaps);
if (error)
- goto out_unreserve;
+ goto out_trans_cancel;
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
@@ -424,26 +437,135 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
+
convert:
- xfs_trim_extent(cmap, offset_fsb, count_fsb);
- /*
- * COW fork extents are supposed to remain unwritten until we're ready
- * to initiate a disk write. For direct I/O we are going to write the
- * data and need the conversion, but for buffered writes we're done.
- */
- if (!convert_now || cmap->br_state == XFS_EXT_NORM)
- return 0;
- trace_xfs_reflink_convert_cow(ip, cmap);
- return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
+ do {
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ break;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+ cmap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even
+ * partially satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
-out_unreserve:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /* CoW fork has a real extent */
+ if (found)
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+
+ /*
+ * CoW fork does not have an extent and data extent is shared.
+ * Allocate a real extent in the CoW fork.
+ */
+ if (cmap->br_startoff > imap->br_startoff)
+ return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /*
+ * CoW fork has a delalloc reservation. Replace it with a real extent.
+ * There may or may not be a data fork mapping.
+ */
+ if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK)
+ return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /* Shouldn't get here. */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
/*
* Cancel CoW reservations for some block range of an inode.
*
@@ -461,7 +583,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t end_fsb,
bool cancel_real)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -496,7 +618,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
del.br_blockcount);
- xfs_bmap_add_free(*tpp, del.br_startblock,
+ xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL);
/* Roll the transaction */
@@ -508,9 +630,8 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -(long)del.br_blockcount, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_unreserve_blkres(ip,
+ del.br_blockcount);
if (error)
break;
} else {
@@ -596,21 +717,21 @@ out:
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- xfs_fileoff_t *end_fsb)
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- xfs_filblks_t rlen;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
+ int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
- *end_fsb = offset_fsb;
+ *offset_fsb = end_fsb;
return 0;
}
@@ -628,47 +749,79 @@ xfs_reflink_end_cow_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
- got.br_startoff + got.br_blockcount <= offset_fsb) {
- *end_fsb = offset_fsb;
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
goto out_cancel;
}
/*
- * Structure copy @got into @del, then trim @del to the range that we
- * were asked to remap. We preserve @got for the eventual CoW fork
- * deletion; from now on @del represents the mapping that we're
- * actually remapping.
- */
- del = got;
- xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
- ASSERT(del.br_blockcount > 0);
-
- /*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
- * need to skip them.
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
*/
- if (!xfs_bmap_is_real_extent(&got)) {
- *end_fsb = del.br_startoff;
- goto out_cancel;
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
}
+ del = got;
- /* Unmap the old blocks in the data fork. */
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
if (error)
goto out_cancel;
- /* Trim the extent to whatever got unmapped. */
- xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -689,7 +842,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
- *end_fsb = del.br_startoff;
+ *offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
@@ -717,11 +870,11 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
- * Walk backwards until we're out of the I/O range. The loop function
+ * Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
- * If we're being called by writeback then the the pages will still
+ * If we're being called by writeback then the pages will still
* have PageWriteback set, which prevents races with reflink remapping
* and truncate. Reflink remapping prevents races with writeback by
* taking the iolock and mmaplock before flushing the pages and
@@ -749,7 +902,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
- error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -757,22 +910,28 @@ xfs_reflink_end_cow(
}
/*
- * Free leftover CoW reservations that didn't get cleaned out.
+ * Free all CoW staging blocks that are still referenced by the ondisk refcount
+ * metadata. The ondisk metadata does not track which inode created the
+ * staging extent, so callers must ensure that there are no cached inodes with
+ * live CoW staging extents.
*/
int
xfs_reflink_recover_cow(
struct xfs_mount *mp)
{
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
int error = 0;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ if (!xfs_has_reflink(mp))
return 0;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- error = xfs_refcount_recover_cow_leftovers(mp, agno);
- if (error)
+ for_each_perag(mp, agno, pag) {
+ error = xfs_refcount_recover_cow_leftovers(mp, pag);
+ if (error) {
+ xfs_perag_put(pag);
break;
+ }
}
return error;
@@ -882,7 +1041,7 @@ xfs_reflink_set_inode_flag(
if (!xfs_is_reflink_inode(src)) {
trace_xfs_reflink_set_inode_flag(src);
xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
- src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
xfs_ifork_init_cow(src);
} else
@@ -894,7 +1053,7 @@ xfs_reflink_set_inode_flag(
if (!xfs_is_reflink_inode(dest)) {
trace_xfs_reflink_set_inode_flag(dest);
xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
- dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
xfs_ifork_init_cow(dest);
} else
@@ -938,12 +1097,12 @@ xfs_reflink_update_dest(
if (newlen > i_size_read(VFS_I(dest))) {
trace_xfs_reflink_update_inode_size(dest, newlen);
i_size_write(VFS_I(dest), newlen);
- dest->i_d.di_size = newlen;
+ dest->i_disk_size = newlen;
}
if (cowextsize) {
- dest->i_d.di_cowextsize = cowextsize;
- dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ dest->i_cowextsize = cowextsize;
+ dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
}
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
@@ -972,7 +1131,7 @@ xfs_reflink_ag_has_free_space(
struct xfs_perag *pag;
int error = 0;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
pag = xfs_perag_get(mp, agno);
@@ -984,127 +1143,208 @@ xfs_reflink_ag_has_free_space(
}
/*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file. The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
*/
STATIC int
xfs_reflink_remap_extent(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec,
- xfs_fileoff_t destoff,
+ struct xfs_bmbt_irec *dmap,
xfs_off_t new_isize)
{
+ struct xfs_bmbt_irec smap;
struct xfs_mount *mp = ip->i_mount;
- bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
- unsigned int resblks;
- struct xfs_bmbt_irec uirec;
- xfs_filblks_t rlen;
- xfs_filblks_t unmap_len;
xfs_off_t newlen;
+ int64_t qdelta = 0;
+ unsigned int resblks;
+ bool quota_reserved = true;
+ bool smap_real;
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int iext_delta = 0;
+ int nimaps;
int error;
- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
- /* No reflinking if we're low on space */
- if (real_extent) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- if (error)
- goto out;
+ /*
+ * Start a rolling transaction to switch the mappings.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the extent can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening. However, we haven't
+ * locked the inode yet, so we reserve assuming this is the case.
+ *
+ * The first allocation call tries to reserve enough space to handle
+ * mapping dmap into a sparse part of the file plus the bmbt split. We
+ * haven't locked the inode or read the existing mapping yet, so we do
+ * not know for sure that we need the space. This should succeed most
+ * of the time.
+ *
+ * If the first attempt fails, try again but reserving only enough
+ * space to handle a bmbt split. This is the hard minimum requirement,
+ * and we revisit quota reservations later when we know more about what
+ * we're remapping.
+ */
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks + dmap->br_blockcount, 0, false, &tp);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ quota_reserved = false;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks, 0, false, &tp);
}
-
- /* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ /*
+ * Read what's currently mapped in the destination file into smap.
+ * If smap isn't a hole, we will have to remove it before we can add
+ * dmap to the destination file.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+ &smap, &nimaps, 0);
+ if (error)
+ goto out_cancel;
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+ smap_real = xfs_bmap_is_real_extent(&smap);
- /* If we're not just clearing space, then do we have enough quota? */
- if (real_extent) {
- error = xfs_trans_reserve_quota_nblks(tp, ip,
- irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_cancel;
+ /*
+ * We can only remap as many blocks as the smaller of the two extent
+ * maps, because we can only remap one extent at a time.
+ */
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to the next
+ * extent if they're both holes or both the same physical extent.
+ */
+ if (dmap->br_startblock == smap.br_startblock) {
+ if (dmap->br_state != smap.br_state)
+ error = -EFSCORRUPTED;
+ goto out_cancel;
}
- trace_xfs_reflink_remap(ip, irec->br_startoff,
- irec->br_blockcount, irec->br_startblock);
+ /* If both extents are unwritten, leave them alone. */
+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+ smap.br_state == XFS_EXT_UNWRITTEN)
+ goto out_cancel;
- /* Unmap the old blocks in the data fork. */
- rlen = unmap_len;
- while (rlen) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+ /* No reflinking if the AG of the dest mapping is low on space. */
+ if (dmap_written) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
if (error)
goto out_cancel;
+ }
- /*
- * Trim the extent to whatever got unmapped.
- * Remember, bunmapi works backwards.
- */
- uirec.br_startblock = irec->br_startblock + rlen;
- uirec.br_startoff = irec->br_startoff + rlen;
- uirec.br_blockcount = unmap_len - rlen;
- unmap_len = rlen;
-
- /* If this isn't a real mapping, we're done. */
- if (!real_extent || uirec.br_blockcount == 0)
- goto next_extent;
+ /*
+ * Increase quota reservation if we think the quota block counter for
+ * this file could increase.
+ *
+ * If we are mapping a written extent into the file, we need to have
+ * enough quota block count reservation to handle the blocks in that
+ * extent. We log only the delta to the quota block counts, so if the
+ * extent we're unmapping also has blocks allocated to it, we don't
+ * need a quota reservation for the extent itself.
+ *
+ * Note that if we're replacing a delalloc reservation with a written
+ * extent, we have to take the full quota reservation because removing
+ * the delalloc reservation gives the block count back to the quota
+ * count. This is suboptimal, but the VFS flushed the dest range
+ * before we started. That should have removed all the delalloc
+ * reservations, but we code defensively.
+ *
+ * xfs_trans_alloc_inode above already tried to grab an even larger
+ * quota reservation, and kicked off a blockgc scan if it couldn't.
+ * If we can't get a potentially smaller quota reservation now, we're
+ * done.
+ */
+ if (!quota_reserved && !smap_real && dmap_written) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip,
+ dmap->br_blockcount, 0, false);
+ if (error)
+ goto out_cancel;
+ }
- trace_xfs_reflink_remap(ip, uirec.br_startoff,
- uirec.br_blockcount, uirec.br_startblock);
+ if (smap_real)
+ ++iext_delta;
- /* Update the refcount tree */
- xfs_refcount_increase_extent(tp, &uirec);
+ if (dmap_written)
+ ++iext_delta;
- /* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
+ if (error)
+ goto out_cancel;
- /* Update quota accounting. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- uirec.br_blockcount);
-
- /* Update dest isize if needed. */
- newlen = XFS_FSB_TO_B(mp,
- uirec.br_startoff + uirec.br_blockcount);
- newlen = min_t(xfs_off_t, newlen, new_isize);
- if (newlen > i_size_read(VFS_I(ip))) {
- trace_xfs_reflink_update_inode_size(ip, newlen);
- i_size_write(VFS_I(ip), newlen);
- ip->i_d.di_size = newlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
+ if (smap_real) {
+ /*
+ * If the extent we're unmapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_refcount_decrease_extent(tp, &smap);
+ qdelta -= smap.br_blockcount;
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+ int done;
-next_extent:
- /* Process all the deferred stuff. */
- error = xfs_defer_finish(&tp);
+ /*
+ * If the extent we're unmapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
+ ASSERT(done);
+ }
+
+ /*
+ * If the extent we're sharing is backed by written storage, increase
+ * its refcount and map it into the file.
+ */
+ if (dmap_written) {
+ xfs_refcount_increase_extent(tp, dmap);
+ xfs_bmap_map_extent(tp, ip, dmap);
+ qdelta += dmap->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_disk_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+ /* Commit everything and unlock. */
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
- return 0;
+ goto out_unlock;
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
return error;
}
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
@@ -1115,25 +1355,22 @@ xfs_reflink_remap_blocks(
loff_t *remapped)
{
struct xfs_bmbt_irec imap;
- xfs_fileoff_t srcoff;
- xfs_fileoff_t destoff;
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
xfs_filblks_t len;
- xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
- srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
- len = XFS_B_TO_FSB(src->i_mount, remap_len);
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+ XFS_MAX_FILEOFF);
- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
- while (len) {
- uint lock_mode;
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
- dest, destoff);
+ while (len > 0) {
+ unsigned int lock_mode;
/* Read extent from the source file */
nimaps = 1;
@@ -1142,18 +1379,25 @@ xfs_reflink_remap_blocks(
xfs_iunlock(src, lock_mode);
if (error)
break;
- ASSERT(nimaps == 1);
-
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
- &imap);
+ /*
+ * The caller supposedly flushed all dirty pages in the source
+ * file range, which means that writeback should have allocated
+ * or deleted all delalloc reservations in that range. If we
+ * find one, that's a good sign that something is seriously
+ * wrong here.
+ */
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = -EFSCORRUPTED;
+ break;
+ }
- /* Translate imap into the destination file. */
- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
- imap.br_startoff += destoff - srcoff;
+ trace_xfs_reflink_remap_extent_src(src, &imap);
- /* Clear dest from destoff to the end of imap and map it in. */
- error = xfs_reflink_remap_extent(dest, &imap, destoff,
- new_isize);
+ /* Remap into the destination file at the given offset. */
+ imap.br_startoff = destoff;
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
if (error)
break;
@@ -1163,10 +1407,10 @@ xfs_reflink_remap_blocks(
}
/* Advance drange/srange */
- srcoff += range_len;
- destoff += range_len;
- len -= range_len;
- remapped_len += range_len;
+ srcoff += imap.br_blockcount;
+ destoff += imap.br_blockcount;
+ len -= imap.br_blockcount;
+ remapped_len += imap.br_blockcount;
}
if (error)
@@ -1177,81 +1421,6 @@ xfs_reflink_remap_blocks(
}
/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding. The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
- struct inode *src,
- struct inode *dest)
-{
- int error;
-
- if (src > dest)
- swap(src, dest);
-
-retry:
- /* Wait to break both inodes' layouts before we start locking. */
- error = break_layout(src, true);
- if (error)
- return error;
- if (src != dest) {
- error = break_layout(dest, true);
- if (error)
- return error;
- }
-
- /* Lock one inode and make sure nobody got in and leased it. */
- inode_lock(src);
- error = break_layout(src, false);
- if (error) {
- inode_unlock(src);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- if (src == dest)
- return 0;
-
- /* Lock the other inode and make sure nobody got in and leased it. */
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
- error = break_layout(dest, false);
- if (error) {
- inode_unlock(src);
- inode_unlock(dest);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
- struct file *file_in,
- struct file *file_out)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
-
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock(inode_in);
-}
-
-/*
* If we're reflinking to a point past the destination file's EOF, we must
* zero any speculative post-EOF preallocations that sit between the old EOF
* and the destination file offset.
@@ -1267,8 +1436,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_buffered_write_iomap_ops);
+ return xfs_zero_range(ip, isize, pos - isize, NULL);
}
/*
@@ -1313,18 +1481,12 @@ xfs_reflink_remap_prep(
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
- ssize_t ret;
+ int ret;
/* Lock both files against IO */
- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+ ret = xfs_ilock2_io_mmap(src, dest);
if (ret)
return ret;
- if (same_inode)
- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
- XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
@@ -1332,13 +1494,17 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
- if (ret < 0 || *len == 0)
+ if (!IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, &xfs_read_iomap_ops);
+ if (ret || *len == 0)
goto out_unlock;
/* Attach dquots to dest inode before changing block map */
@@ -1373,9 +1539,9 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
- return 1;
+ return 0;
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
return ret;
}
@@ -1389,36 +1555,37 @@ xfs_reflink_inode_has_shared_extents(
struct xfs_bmbt_irec got;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
struct xfs_iext_cursor icur;
bool found;
int error;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
- if (error)
- return error;
- }
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
*has_shared = false;
found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
+ struct xfs_perag *pag;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
goto next;
- agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
aglen = got.br_blockcount;
-
- error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
+ error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
&rbno, &rlen, false);
+ xfs_perag_put(pag);
if (error)
return error;
+
/* Is there still a shared block here? */
if (rbno != NULLAGBLOCK) {
*has_shared = true;
@@ -1462,7 +1629,7 @@ xfs_reflink_clear_inode_flag(
/* Clear the inode flag. */
trace_xfs_reflink_unset_inode_flag(ip);
- ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
@@ -1530,7 +1697,9 @@ xfs_reflink_unshare(
&xfs_buffered_write_iomap_ops);
if (error)
goto out;
- error = filemap_write_and_wait(inode->i_mapping);
+
+ error = filemap_write_and_wait_range(inode->i_mapping, offset,
+ offset + len - 1);
if (error)
goto out;