aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_buf.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_buf.c')
-rw-r--r--fs/xfs/xfs_buf.c109
1 files changed, 91 insertions, 18 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e839907e8492..b21ea2ba768d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone;
#define xb_to_gfp(flags) \
((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ * b_sema (caller holds)
+ * b_lock
+ *
+ * xfs_buf_stale:
+ * b_sema (caller holds)
+ * b_lock
+ * lru_lock
+ *
+ * xfs_buf_rele:
+ * b_lock
+ * pag_buf_lock
+ * lru_lock
+ *
+ * xfs_buftarg_wait_rele
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ */
static inline int
xfs_buf_is_vmapped(
@@ -749,6 +775,30 @@ _xfs_buf_read(
return xfs_buf_submit(bp);
}
+/*
+ * If the caller passed in an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use them to verify the contents. If the contents
+ * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
+ * recorded errors and is already in XBF_DONE state.
+ */
+int
+xfs_buf_ensure_ops(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+ ASSERT(bp->b_error == 0);
+
+ if (!ops || bp->b_ops)
+ return 0;
+
+ bp->b_ops = ops;
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ bp->b_flags &= ~XBF_DONE;
+ return bp->b_error;
+}
+
xfs_buf_t *
xfs_buf_read_map(
struct xfs_buftarg *target,
@@ -762,26 +812,32 @@ xfs_buf_read_map(
flags |= XBF_READ;
bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (bp) {
- trace_xfs_buf_read(bp, flags, _RET_IP_);
+ if (!bp)
+ return NULL;
- if (!(bp->b_flags & XBF_DONE)) {
- XFS_STATS_INC(target->bt_mount, xb_get_read);
- bp->b_ops = ops;
- _xfs_buf_read(bp, flags);
- } else if (flags & XBF_ASYNC) {
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
- xfs_buf_relse(bp);
- return NULL;
- } else {
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- }
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!(bp->b_flags & XBF_DONE)) {
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ _xfs_buf_read(bp, flags);
+ return bp;
}
+ xfs_buf_ensure_ops(bp, ops);
+
+ if (flags & XBF_ASYNC) {
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
return bp;
}
@@ -1006,8 +1062,18 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ /*
+ * We grab the b_lock here first to serialise racing xfs_buf_rele()
+ * calls. The pag_buf_lock being taken on the last reference only
+ * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+ * to last reference we drop here is not serialised against the last
+ * reference until we take bp->b_lock. Hence if we don't grab b_lock
+ * first, the last "release" reference can win the race to the lock and
+ * free the buffer before the second-to-last reference is processed,
+ * leading to a use-after-free scenario.
+ */
spin_lock(&bp->b_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers(
* is only safely useable for callers that can track I/O completion by higher
* level means, e.g. AIL pushing as the @buffer_list is consumed in this
* function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
*/
int
xfs_buf_delwri_submit_nowait(