Diffstat:
-rw-r--r--  fs/afs/file.c | 5
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/aio.c | 186
-rw-r--r--  fs/attr.c | 4
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/async-thread.c | 14
-rw-r--r--  fs/btrfs/backref.c | 77
-rw-r--r--  fs/btrfs/block-group.c | 35
-rw-r--r--  fs/btrfs/block-rsv.c | 84
-rw-r--r--  fs/btrfs/block-rsv.h | 5
-rw-r--r--  fs/btrfs/btrfs_inode.h | 18
-rw-r--r--  fs/btrfs/compression.c | 11
-rw-r--r--  fs/btrfs/ctree.c | 565
-rw-r--r--  fs/btrfs/ctree.h | 163
-rw-r--r--  fs/btrfs/delalloc-space.c | 14
-rw-r--r--  fs/btrfs/delayed-inode.c | 3
-rw-r--r--  fs/btrfs/delayed-ref.c | 25
-rw-r--r--  fs/btrfs/dev-replace.c | 11
-rw-r--r--  fs/btrfs/dir-item.c | 12
-rw-r--r--  fs/btrfs/disk-io.c | 404
-rw-r--r--  fs/btrfs/disk-io.h | 11
-rw-r--r--  fs/btrfs/extent-tree.c | 160
-rw-r--r--  fs/btrfs/extent_io.c | 74
-rw-r--r--  fs/btrfs/file-item.c | 33
-rw-r--r--  fs/btrfs/free-space-cache.c | 322
-rw-r--r--  fs/btrfs/free-space-cache.h | 10
-rw-r--r--  fs/btrfs/free-space-tree.c | 54
-rw-r--r--  fs/btrfs/inode-item.c | 344
-rw-r--r--  fs/btrfs/inode-item.h | 96
-rw-r--r--  fs/btrfs/inode.c | 643
-rw-r--r--  fs/btrfs/ioctl.c | 91
-rw-r--r--  fs/btrfs/lzo.c | 15
-rw-r--r--  fs/btrfs/print-tree.c | 8
-rw-r--r--  fs/btrfs/props.c | 7
-rw-r--r--  fs/btrfs/qgroup.c | 27
-rw-r--r--  fs/btrfs/reada.c | 1086
-rw-r--r--  fs/btrfs/ref-verify.c | 8
-rw-r--r--  fs/btrfs/reflink.c | 2
-rw-r--r--  fs/btrfs/relocation.c | 41
-rw-r--r--  fs/btrfs/root-tree.c | 9
-rw-r--r--  fs/btrfs/scrub.c | 234
-rw-r--r--  fs/btrfs/send.c | 417
-rw-r--r--  fs/btrfs/space-info.c | 93
-rw-r--r--  fs/btrfs/space-info.h | 2
-rw-r--r--  fs/btrfs/super.c | 1
-rw-r--r--  fs/btrfs/sysfs.c | 11
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 1
-rw-r--r--  fs/btrfs/tests/extent-buffer-tests.c | 17
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 52
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 186
-rw-r--r--  fs/btrfs/tests/free-space-tree-tests.c | 5
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 5
-rw-r--r--  fs/btrfs/transaction.c | 162
-rw-r--r--  fs/btrfs/transaction.h | 3
-rw-r--r--  fs/btrfs/tree-checker.c | 56
-rw-r--r--  fs/btrfs/tree-defrag.c | 8
-rw-r--r--  fs/btrfs/tree-log.c | 626
-rw-r--r--  fs/btrfs/uuid-tree.c | 10
-rw-r--r--  fs/btrfs/verity.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 145
-rw-r--r--  fs/btrfs/volumes.h | 9
-rw-r--r--  fs/btrfs/xattr.c | 8
-rw-r--r--  fs/btrfs/zoned.c | 122
-rw-r--r--  fs/btrfs/zoned.h | 30
-rw-r--r--  fs/cachefiles/bind.c | 2
-rw-r--r--  fs/ceph/caps.c | 16
-rw-r--r--  fs/ceph/file.c | 20
-rw-r--r--  fs/ceph/mds_client.c | 3
-rw-r--r--  fs/cifs/cifs_swn.c | 16
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsproto.h | 1
-rw-r--r--  fs/cifs/connect.c | 41
-rw-r--r--  fs/cifs/dfs_cache.c | 7
-rw-r--r--  fs/cifs/fs_context.c | 38
-rw-r--r--  fs/cifs/fscache.c | 46
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/sess.c | 70
-rw-r--r--  fs/cifs/smb2pdu.c | 2
-rw-r--r--  fs/dlm/ast.c | 16
-rw-r--r--  fs/dlm/debug_fs.c | 96
-rw-r--r--  fs/dlm/dir.c | 3
-rw-r--r--  fs/dlm/dlm_internal.h | 12
-rw-r--r--  fs/dlm/lock.c | 109
-rw-r--r--  fs/dlm/lock.h | 4
-rw-r--r--  fs/dlm/lockspace.c | 38
-rw-r--r--  fs/dlm/lowcomms.c | 209
-rw-r--r--  fs/dlm/lowcomms.h | 6
-rw-r--r--  fs/dlm/main.c | 3
-rw-r--r--  fs/dlm/member.c | 3
-rw-r--r--  fs/dlm/memory.c | 68
-rw-r--r--  fs/dlm/memory.h | 6
-rw-r--r--  fs/dlm/midcomms.c | 85
-rw-r--r--  fs/dlm/midcomms.h | 3
-rw-r--r--  fs/dlm/rcom.c | 2
-rw-r--r--  fs/dlm/recoverd.c | 3
-rw-r--r--  fs/dlm/requestqueue.c | 17
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/erofs/Makefile | 2
-rw-r--r--  fs/erofs/compress.h | 4
-rw-r--r--  fs/erofs/data.c | 138
-rw-r--r--  fs/erofs/decompressor.c | 134
-rw-r--r--  fs/erofs/decompressor_lzma.c | 19
-rw-r--r--  fs/erofs/erofs_fs.h | 18
-rw-r--r--  fs/erofs/inode.c | 68
-rw-r--r--  fs/erofs/internal.h | 52
-rw-r--r--  fs/erofs/super.c | 121
-rw-r--r--  fs/erofs/sysfs.c | 256
-rw-r--r--  fs/erofs/utils.c | 8
-rw-r--r--  fs/erofs/xattr.c | 135
-rw-r--r--  fs/erofs/xattr.h | 1
-rw-r--r--  fs/erofs/zdata.c | 170
-rw-r--r--  fs/erofs/zdata.h | 24
-rw-r--r--  fs/erofs/zmap.c | 159
-rw-r--r--  fs/ext4/acl.c | 2
-rw-r--r--  fs/ext4/dir.c | 1
-rw-r--r--  fs/ext4/ext4.h | 22
-rw-r--r--  fs/ext4/ext4_jbd2.c | 2
-rw-r--r--  fs/ext4/extents.c | 16
-rw-r--r--  fs/ext4/fast_commit.c | 207
-rw-r--r--  fs/ext4/fast_commit.h | 27
-rw-r--r--  fs/ext4/file.c | 4
-rw-r--r--  fs/ext4/inode.c | 60
-rw-r--r--  fs/ext4/ioctl.c | 324
-rw-r--r--  fs/ext4/mballoc.c | 52
-rw-r--r--  fs/ext4/migrate.c | 23
-rw-r--r--  fs/ext4/move_extent.c | 1
-rw-r--r--  fs/ext4/resize.c | 19
-rw-r--r--  fs/ext4/super.c | 1913
-rw-r--r--  fs/ext4/sysfs.c | 36
-rw-r--r--  fs/file.c | 68
-rw-r--r--  fs/fs_parser.c | 31
-rw-r--r--  fs/fuse/dev.c | 10
-rw-r--r--  fs/gfs2/bmap.c | 2
-rw-r--r--  fs/gfs2/file.c | 9
-rw-r--r--  fs/gfs2/glock.c | 26
-rw-r--r--  fs/gfs2/glops.c | 2
-rw-r--r--  fs/gfs2/inode.c | 109
-rw-r--r--  fs/gfs2/super.c | 22
-rw-r--r--  fs/hostfs/hostfs_kern.c | 3
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/io-wq.c | 38
-rw-r--r--  fs/io_uring.c | 91
-rw-r--r--  fs/iomap/buffered-io.c | 26
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/jffs2/file.c | 40
-rw-r--r--  fs/ksmbd/ndr.c | 2
-rw-r--r--  fs/ksmbd/smb2ops.c | 3
-rw-r--r--  fs/ksmbd/smb2pdu.c | 59
-rw-r--r--  fs/ksmbd/smbacl.c | 19
-rw-r--r--  fs/ksmbd/smbacl.h | 5
-rw-r--r--  fs/namespace.c | 62
-rw-r--r--  fs/netfs/read_helper.c | 25
-rw-r--r--  fs/nfs/dir.c | 1
-rw-r--r--  fs/nfs/fs_context.c | 1
-rw-r--r--  fs/nfs/inode.c | 1
-rw-r--r--  fs/nfs/nfs42proc.c | 4
-rw-r--r--  fs/nfs/nfs42xdr.c | 3
-rw-r--r--  fs/nfs/nfs4state.c | 4
-rw-r--r--  fs/nfs/nfstrace.h | 1
-rw-r--r--  fs/nfsd/export.c | 2
-rw-r--r--  fs/nfsd/nfs3proc.c | 11
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 9
-rw-r--r--  fs/nfsd/nfs4xdr.c | 7
-rw-r--r--  fs/nfsd/nfsctl.c | 14
-rw-r--r--  fs/nfsd/nfsproc.c | 8
-rw-r--r--  fs/ntfs/Kconfig | 1
-rw-r--r--  fs/open.c | 8
-rw-r--r--  fs/overlayfs/super.c | 2
-rw-r--r--  fs/posix_acl.c | 17
-rw-r--r--  fs/proc/proc_net.c | 19
-rw-r--r--  fs/proc/vmcore.c | 20
-rw-r--r--  fs/proc_namespace.c | 2
-rw-r--r--  fs/pstore/Kconfig | 1
-rw-r--r--  fs/pstore/blk.c | 2
-rw-r--r--  fs/pstore/ftrace.c | 46
-rw-r--r--  fs/select.c | 64
-rw-r--r--  fs/signalfd.c | 12
-rw-r--r--  fs/smbfs_common/cifs_arc4.c | 13
-rw-r--r--  fs/tracefs/inode.c | 76
-rw-r--r--  fs/ubifs/Makefile | 2
-rw-r--r--  fs/ubifs/dir.c | 4
-rw-r--r--  fs/ubifs/gc.c | 19
-rw-r--r--  fs/ubifs/io.c | 21
-rw-r--r--  fs/ubifs/replay.c | 2
-rw-r--r--  fs/ubifs/super.c | 23
-rw-r--r--  fs/ubifs/sysfs.c | 153
-rw-r--r--  fs/ubifs/ubifs.h | 35
-rw-r--r--  fs/udf/dir.c | 32
-rw-r--r--  fs/udf/namei.c | 3
-rw-r--r--  fs/udf/super.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 17
-rw-r--r--  fs/xfs/scrub/dir.c | 15
-rw-r--r--  fs/xfs/scrub/inode.c | 14
-rw-r--r--  fs/xfs/scrub/quota.c | 4
-rw-r--r--  fs/xfs/scrub/repair.c | 3
-rw-r--r--  fs/xfs/scrub/scrub.c | 4
-rw-r--r--  fs/xfs/scrub/scrub.h | 1
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c | 2
-rw-r--r--  fs/xfs/xfs_dquot.c | 79
-rw-r--r--  fs/xfs/xfs_error.c | 3
-rw-r--r--  fs/xfs/xfs_icache.c | 24
-rw-r--r--  fs/xfs/xfs_inode.c | 9
-rw-r--r--  fs/xfs/xfs_inode.h | 4
-rw-r--r--  fs/xfs/xfs_ioctl.c | 5
-rw-r--r--  fs/xfs/xfs_ioctl.h | 5
-rw-r--r--  fs/xfs/xfs_iops.c | 40
-rw-r--r--  fs/xfs/xfs_linux.h | 1
-rw-r--r--  fs/xfs/xfs_log_cil.c | 52
-rw-r--r--  fs/xfs/xfs_log_recover.c | 26
-rw-r--r--  fs/xfs/xfs_mount.c | 10
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 11
-rw-r--r--  fs/xfs/xfs_reflink.c | 5
-rw-r--r--  fs/xfs/xfs_super.c | 23
-rw-r--r--  fs/xfs/xfs_symlink.c | 33
-rw-r--r--  fs/xfs/xfs_sysfs.c | 16
-rw-r--r--  fs/xfs/xfs_trans.c | 11
-rw-r--r--  fs/zonefs/super.c | 1
218 files changed, 7925 insertions, 5653 deletions
diff --git a/fs/afs/file.c b/fs/afs/file.c
index cb6ad61eec3b..afe4b803f84b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -514,8 +514,9 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
down_write(&vnode->volume->cell->fs_open_mmaps_lock);
- list_add_tail(&vnode->cb_mmap_link,
- &vnode->volume->cell->fs_open_mmaps);
+ if (list_empty(&vnode->cb_mmap_link))
+ list_add_tail(&vnode->cb_mmap_link,
+ &vnode->volume->cell->fs_open_mmaps);
up_write(&vnode->volume->cell->fs_open_mmaps_lock);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index d110def8aa8e..34c68724c98b 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -667,6 +667,7 @@ static void afs_i_init_once(void *_vnode)
INIT_LIST_HEAD(&vnode->pending_locks);
INIT_LIST_HEAD(&vnode->granted_locks);
INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
+ INIT_LIST_HEAD(&vnode->cb_mmap_link);
seqlock_init(&vnode->cb_lock);
}
diff --git a/fs/aio.c b/fs/aio.c
index 9c81cf611d65..f6f1cbffef9e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -181,8 +181,9 @@ struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
bool cancelled;
+ bool work_scheduled;
+ bool work_need_resched;
struct wait_queue_entry wait;
struct work_struct work;
};
@@ -1619,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
iocb_put(iocb);
}
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken. Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+ wait_queue_head_t *head;
+
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us, then check whether the request is still on the queue.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ head = smp_load_acquire(&req->head);
+ if (head) {
+ spin_lock(&head->lock);
+ if (!list_empty(&req->wait.entry))
+ return true;
+ spin_unlock(&head->lock);
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+ spin_unlock(&req->head->lock);
+ rcu_read_unlock();
+}
+
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1638,14 +1684,27 @@ static void aio_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock);
- if (!mask && !READ_ONCE(req->cancelled)) {
- add_wait_queue(req->head, &req->wait);
- spin_unlock_irq(&ctx->ctx_lock);
- return;
- }
+ if (poll_iocb_lock_wq(req)) {
+ if (!mask && !READ_ONCE(req->cancelled)) {
+ /*
+ * The request isn't actually ready to be completed yet.
+ * Reschedule completion if another wakeup came in.
+ */
+ if (req->work_need_resched) {
+ schedule_work(&req->work);
+ req->work_need_resched = false;
+ } else {
+ req->work_scheduled = false;
+ }
+ poll_iocb_unlock_wq(req);
+ spin_unlock_irq(&ctx->ctx_lock);
+ return;
+ }
+ list_del_init(&req->wait.entry);
+ poll_iocb_unlock_wq(req);
+ } /* else, POLLFREE has freed the waitqueue, so we must complete */
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb);
@@ -1657,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
struct poll_iocb *req = &aiocb->poll;
- spin_lock(&req->head->lock);
- WRITE_ONCE(req->cancelled, true);
- if (!list_empty(&req->wait.entry)) {
- list_del_init(&req->wait.entry);
- schedule_work(&aiocb->poll.work);
- }
- spin_unlock(&req->head->lock);
+ if (poll_iocb_lock_wq(req)) {
+ WRITE_ONCE(req->cancelled, true);
+ if (!req->work_scheduled) {
+ schedule_work(&aiocb->poll.work);
+ req->work_scheduled = true;
+ }
+ poll_iocb_unlock_wq(req);
+ } /* else, the request was force-cancelled by POLLFREE already */
return 0;
}
@@ -1680,21 +1740,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & req->events))
return 0;
- list_del_init(&req->wait.entry);
-
- if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ /*
+ * Complete the request inline if possible. This requires that three
+ * conditions be met:
+ * 1. An event mask must have been passed. If a plain wakeup was done
+ * instead, then mask == 0 and we have to call vfs_poll() to get
+ * the events, so inline completion isn't possible.
+ * 2. The completion work must not have already been scheduled.
+ * 3. ctx_lock must not be busy. We have to use trylock because we
+ * already hold the waitqueue lock, so this inverts the normal
+ * locking order. Use irqsave/irqrestore because not all
+ * filesystems (e.g. fuse) call this function with IRQs disabled,
+ * yet IRQs have to be disabled before ctx_lock is obtained.
+ */
+ if (mask && !req->work_scheduled &&
+ spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
struct kioctx *ctx = iocb->ki_ctx;
- /*
- * Try to complete the iocb inline if we can. Use
- * irqsave/irqrestore because not all filesystems (e.g. fuse)
- * call this function with IRQs disabled and because IRQs
- * have to be disabled before ctx_lock is obtained.
- */
+ list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_allowed()) {
+ if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
@@ -1703,7 +1769,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (iocb)
iocb_put(iocb);
} else {
- schedule_work(&req->work);
+ /*
+ * Schedule the completion work if needed. If it was already
+ * scheduled, record that another wakeup came in.
+ *
+ * Don't remove the request from the waitqueue here, as it might
+ * not actually be complete yet (we won't know until vfs_poll()
+ * is called), and we must not miss any wakeups. POLLFREE is an
+ * exception to this; see below.
+ */
+ if (req->work_scheduled) {
+ req->work_need_resched = true;
+ } else {
+ schedule_work(&req->work);
+ req->work_scheduled = true;
+ }
+
+ /*
+ * If the waitqueue is being freed early but we can't complete
+ * the request inline, we have to tear down the request as best
+ * we can. That means immediately removing the request from its
+ * waitqueue and preventing all further accesses to the
+ * waitqueue via the request. We also need to schedule the
+ * completion work (done above). Also mark the request as
+ * cancelled, to potentially skip an unneeded call to ->poll().
+ */
+ if (mask & POLLFREE) {
+ WRITE_ONCE(req->cancelled, true);
+ list_del_init(&req->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&req->head, NULL);
+ }
}
return 1;
}
@@ -1711,6 +1813,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct aio_poll_table {
struct poll_table_struct pt;
struct aio_kiocb *iocb;
+ bool queued;
int error;
};
@@ -1721,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
/* multiple wait queues per file are not supported */
- if (unlikely(pt->iocb->poll.head)) {
+ if (unlikely(pt->queued)) {
pt->error = -EINVAL;
return;
}
+ pt->queued = true;
pt->error = 0;
pt->iocb->poll.head = head;
add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1750,12 +1854,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->done = false;
req->cancelled = false;
+ req->work_scheduled = false;
+ req->work_need_resched = false;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
+ apt.queued = false;
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
@@ -1764,23 +1870,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
mask = vfs_poll(req->file, &apt.pt) & req->events;
spin_lock_irq(&ctx->ctx_lock);
- if (likely(req->head)) {
- spin_lock(&req->head->lock);
- if (unlikely(list_empty(&req->wait.entry))) {
- if (apt.error)
+ if (likely(apt.queued)) {
+ bool on_queue = poll_iocb_lock_wq(req);
+
+ if (!on_queue || req->work_scheduled) {
+ /*
+ * aio_poll_wake() already either scheduled the async
+ * completion work, or completed the request inline.
+ */
+ if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
}
if (mask || apt.error) {
+ /* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} else if (cancel) {
+ /* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
- } else if (!req->done) { /* actually waiting for an event */
+ } else if (on_queue) {
+ /*
+ * Actually waiting for an event, so add the request to
+ * active_reqs so that it can be cancelled if needed.
+ */
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
}
- spin_unlock(&req->head->lock);
+ if (on_queue)
+ poll_iocb_unlock_wq(req);
}
if (mask) { /* no async, we'd stolen it */
aiocb->ki_res.res = mangle_poll(mask);
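The poll_iocb_lock_wq() comments above describe a reusable pattern: because every wake_up_pollfree() user RCU-delays the actual free, a reader can pair rcu_read_lock() with smp_load_acquire() on the waitqueue pointer, while the teardown side detaches the entry and only then publishes NULL with smp_store_release(). A minimal standalone sketch of that acquire/release handoff, using hypothetical names (struct watched_req, watched_req_lock_wq, watched_req_detach) rather than the aio code itself:

#include <asm/barrier.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct watched_req {
	wait_queue_head_t *head;		/* NULL once detached */
	struct wait_queue_entry wait;
};

/* Reader side: lock the waitqueue only if the request is still queued. */
static bool watched_req_lock_wq(struct watched_req *req)
{
	wait_queue_head_t *head;

	rcu_read_lock();
	head = smp_load_acquire(&req->head);	/* pairs with the release below */
	if (head) {
		spin_lock(&head->lock);
		if (!list_empty(&req->wait.entry))
			return true;	/* keep rcu_read_lock() and head->lock held */
		spin_unlock(&head->lock);
	}
	rcu_read_unlock();
	return false;
}

/*
 * Teardown side, called from the wakeup path with head->lock already held:
 * detach the entry first and publish the NULL pointer last, so readers
 * either see the queue and take its lock, or see NULL and never touch
 * memory that is about to be freed after the RCU grace period.
 */
static void watched_req_detach(struct watched_req *req)
{
	list_del_init(&req->wait.entry);
	smp_store_release(&req->head, NULL);
}

As in poll_iocb_lock_wq(), a successful lock attempt in this sketch returns with both the RCU read lock and the waitqueue lock held, and the caller must drop them again in the matching order.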
diff --git a/fs/attr.c b/fs/attr.c
index 473d21b3a86d..66899b6e9bd8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -35,7 +35,7 @@ static bool chown_ok(struct user_namespace *mnt_userns,
kuid_t uid)
{
kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid))
+ if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
@@ -62,7 +62,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns,
{
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
- (in_group_p(gid) || gid_eq(gid, kgid)))
+ (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3dcf9bcc2326..4188ba3fd8c3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -27,7 +27,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 309516e6a968..43c89952b7d2 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -234,6 +234,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT;
+ * paired with the smp_mb__before_atomic() in btrfs_work_helper(),
+ * this guarantees that the ordered function will see all updates
+ * made by the ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -317,6 +324,13 @@ static void btrfs_work_helper(struct work_struct *normal_work)
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT, so that the thread
+ * which is going to execute the ordered work sees them.
+ * Pairs with the smp_rmb() in run_ordered_work().
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
} else {
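The two comments added above spell out a standard publish/consume barrier pairing: smp_mb__before_atomic() orders the work function's stores before set_bit(WORK_DONE_BIT), and smp_rmb() orders the consumer's subsequent loads after the bit is observed. A stripped-down sketch of the same pairing, with hypothetical names (struct demo_work, demo_work_finish, demo_work_poll) rather than the btrfs workqueue code:

#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define DEMO_WORK_DONE_BIT	0

struct demo_work {
	unsigned long flags;
	int result;			/* written by the ordinary work function */
};

/* Producer: finish the ordinary work, then publish completion. */
static void demo_work_finish(struct demo_work *w, int result)
{
	w->result = result;
	/* Order the store to ->result before setting the done bit. */
	smp_mb__before_atomic();
	set_bit(DEMO_WORK_DONE_BIT, &w->flags);
}

/* Consumer: only read ->result after observing the done bit. */
static bool demo_work_poll(struct demo_work *w, int *result)
{
	if (!test_bit(DEMO_WORK_DONE_BIT, &w->flags))
		return false;
	/* Order the load of ->result after the load of the done bit. */
	smp_rmb();
	*result = w->result;
	return true;
}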
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f735b8798ba1..c9ee579bc5a6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -950,7 +950,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
@@ -1049,12 +1049,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_keyed_refs(struct btrfs_fs_info *fs_info,
+static int add_keyed_refs(struct btrfs_root *extent_root,
struct btrfs_path *path, u64 bytenr,
int info_level, struct preftrees *preftrees,
struct share_check *sc)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
int ret;
int slot;
struct extent_buffer *leaf;
@@ -1170,6 +1170,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct ulist *roots, const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1203,28 +1204,26 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
- /*
- * grab both a lock on the path and a lock on the delayed ref head.
- * We need both to get a consistent picture of how the refs look
- * at a specified point in time
- */
again:
head = NULL;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
time_seq != BTRFS_SEQ_LAST) {
-#else
- if (trans && time_seq != BTRFS_SEQ_LAST) {
-#endif
/*
- * look if there are updates for this ref queued and lock the
- * head
+ * We have a specific time_seq we care about and a transaction,
+ * which means we have the path lock; we need to grab the ref head
+ * and lock it so we have a consistent view of the refs at the
+ * given time.
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -1271,7 +1270,7 @@ again:
&info_level, &preftrees, sc);
if (ret)
goto out;
- ret = add_keyed_refs(fs_info, path, bytenr, info_level,
+ ret = add_keyed_refs(root, path, bytenr, info_level,
&preftrees, sc);
if (ret)
goto out;
@@ -1360,10 +1359,18 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However, if there was corruption we may not
+ * have found an eie; return an error in that
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
@@ -1740,6 +1747,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags_ret)
{
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
int ret;
u64 flags;
u64 size = 0;
@@ -1755,11 +1763,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
key.objectid = logical;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1779,7 +1787,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
@@ -1962,7 +1970,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
extent_item_objectid);
if (!search_commit_root) {
- trans = btrfs_attach_transaction(fs_info->extent_root);
+ trans = btrfs_attach_transaction(fs_info->tree_root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT &&
PTR_ERR(trans) != -EROFS)
@@ -2058,7 +2066,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u64 parent = 0;
int found = 0;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2084,10 +2091,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item = btrfs_item_nr(slot);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
@@ -2143,7 +2149,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr_offset(eb, slot);
cur_offset = 0;
@@ -2330,6 +2336,7 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
{
struct btrfs_fs_info *fs_info = iter->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_path *path = iter->path;
struct btrfs_extent_item *ei;
struct btrfs_key key;
@@ -2340,7 +2347,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
key.offset = (u64)-1;
iter->bytenr = bytenr;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (ret == 0) {
@@ -2364,7 +2371,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->end_ptr = (u32)(iter->item_ptr +
- btrfs_item_size_nr(path->nodes[0], path->slots[0]));
+ btrfs_item_size(path->nodes[0], path->slots[0]));
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
@@ -2383,7 +2390,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
/* If there is no inline backref, go search for keyed backref */
if (iter->cur_ptr >= iter->end_ptr) {
- ret = btrfs_next_item(fs_info->extent_root, path);
+ ret = btrfs_next_item(extent_root, path);
/* No inline nor keyed ref */
if (ret > 0) {
@@ -2404,7 +2411,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->item_ptr = iter->cur_ptr;
- iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
path->nodes[0], path->slots[0]));
}
@@ -2427,6 +2434,7 @@ release:
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
{
struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct btrfs_root *extent_root;
struct btrfs_path *path = iter->path;
struct btrfs_extent_inline_ref *iref;
int ret;
@@ -2457,7 +2465,8 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
/* We're at keyed items, there is no inline item, go to the next one */
- ret = btrfs_next_item(iter->fs_info->extent_root, iter->path);
+ extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+ ret = btrfs_next_item(extent_root, iter->path);
if (ret)
return ret;
@@ -2469,7 +2478,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
iter->cur_ptr = iter->item_ptr;
- iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
path->slots[0]);
return 0;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 444e9c89ff3e..1db24e6d6d90 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -514,7 +514,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -529,6 +529,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+ extent_root = btrfs_extent_root(fs_info, last);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -841,7 +842,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int ret;
- root = fs_info->extent_root;
+ root = btrfs_block_group_root(fs_info);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
@@ -1106,6 +1107,7 @@ out:
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -1139,8 +1141,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
num_items = 3 + map->num_stripes;
free_extent_map(em);
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items);
+ return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
/*
@@ -1508,7 +1509,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
- LIST_HEAD(again_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
@@ -1585,18 +1585,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret && ret != -EAGAIN)
+ if (ret)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
next:
+ btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
- if (ret == -EAGAIN && list_empty(&bg->bg_list))
- list_add_tail(&bg->bg_list, &again_list);
- else
- btrfs_put_block_group(bg);
}
- list_splice_tail(&again_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
@@ -1678,7 +1674,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
@@ -2165,6 +2161,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
+ struct btrfs_root *root = btrfs_block_group_root(info);
struct btrfs_path *path;
int ret;
struct btrfs_block_group *cache;
@@ -2173,7 +2170,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!info->extent_root)
+ if (!root)
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2276,7 +2273,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_item bgi;
- struct btrfs_root *root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
spin_lock(&block_group->lock);
@@ -2289,7 +2286,6 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- root = fs_info->extent_root;
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
@@ -2543,12 +2539,13 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
int ret;
bool dirty_bg_running;
do {
- trans = btrfs_join_transaction(fs_info->extent_root);
+ trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -2653,7 +2650,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
@@ -3790,7 +3787,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
}
if (!ret) {
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ ret = btrfs_block_rsv_add(fs_info,
&fs_info->chunk_block_rsv,
bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
@@ -3911,9 +3908,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
- spin_unlock(&info->unused_bgs_lock);
- spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->reclaim_bgs)) {
block_group = list_first_entry(&info->reclaim_bgs,
struct btrfs_block_group,
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 04a6226e0388..b3ee49b0b1e8 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,6 +6,7 @@
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
+#include "disk-io.h"
/*
* HOW DO BLOCK RESERVES WORK
@@ -208,7 +209,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
{
@@ -217,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -241,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush)
{
@@ -262,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -351,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
- u64 num_bytes;
- unsigned min_items;
+ struct btrfs_root *root, *tmp;
+ u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
+ unsigned int min_items = 1;
/*
* The global block rsv is based on the size of the extent tree, the
* checksum tree and the root tree. If the fs is empty we want to set
* it to a minimal amount for safety.
+ *
+ * At a minimum we are also going to need to modify the tree root
+ * and any global roots we could touch.
*/
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
- btrfs_root_used(&fs_info->csum_root->root_item) +
- btrfs_root_used(&fs_info->tree_root->root_item);
-
- /*
- * We at a minimum are going to modify the csum root, the tree root, and
- * the extent root.
- */
- min_items = 3;
+ read_lock(&fs_info->global_root_lock);
+ rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
+ rb_node) {
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ num_bytes += btrfs_root_used(&root->root_item);
+ min_items++;
+ }
+ }
+ read_unlock(&fs_info->global_root_lock);
/*
* But we also want to reserve enough space so we can do the fallback
@@ -412,6 +419,30 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_unlock(&sinfo->lock);
}
+void btrfs_init_root_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ switch (root->root_key.objectid) {
+ case BTRFS_CSUM_TREE_OBJECTID:
+ case BTRFS_EXTENT_TREE_OBJECTID:
+ case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ root->block_rsv = &fs_info->delayed_refs_rsv;
+ break;
+ case BTRFS_ROOT_TREE_OBJECTID:
+ case BTRFS_DEV_TREE_OBJECTID:
+ case BTRFS_QUOTA_TREE_OBJECTID:
+ root->block_rsv = &fs_info->global_block_rsv;
+ break;
+ case BTRFS_CHUNK_TREE_OBJECTID:
+ root->block_rsv = &fs_info->chunk_block_rsv;
+ break;
+ default:
+ root->block_rsv = NULL;
+ break;
+ }
+}
+
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -426,22 +457,6 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
- /*
- * Our various recovery options can leave us with NULL roots, so check
- * here and just bail before we go dereferencing NULLs everywhere.
- */
- if (!fs_info->extent_root || !fs_info->csum_root ||
- !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
- return;
-
- fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
- if (fs_info->quota_root)
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
btrfs_update_global_block_rsv(fs_info);
}
@@ -467,8 +482,9 @@ static struct btrfs_block_rsv *get_block_rsv(
struct btrfs_block_rsv *block_rsv = NULL;
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
- (root == fs_info->csum_root && trans->adding_csums) ||
- (root == fs_info->uuid_root))
+ (root == fs_info->uuid_root) ||
+ (trans->adding_csums &&
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -523,7 +539,7 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 0b6ae5302837..3b67ff08d434 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -50,6 +50,7 @@ struct btrfs_block_rsv {
};
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
@@ -57,11 +58,11 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ab2a4a52e0bb..b3e46aabc3d8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -138,19 +138,11 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- union {
- /*
- * Total number of bytes pending delalloc, used by stat to
- * calculate the real block usage of the file. This is used
- * only for files.
- */
- u64 delalloc_bytes;
- /*
- * The offset of the last dir item key that was logged.
- * This is used only for directories.
- */
- u64 last_dir_item_offset;
- };
+ /*
+ * Total number of bytes pending delalloc, used by stat to calculate the
+ * real block usage of the file. This is used only for files.
+ */
+ u64 delalloc_bytes;
union {
/*
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 32da97c3c19d..71e5b2e9a1ba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -96,10 +96,10 @@ static int compression_compress_pages(int type, struct list_head *ws,
}
}
-static int compression_decompress_bio(int type, struct list_head *ws,
- struct compressed_bio *cb)
+static int compression_decompress_bio(struct list_head *ws,
+ struct compressed_bio *cb)
{
- switch (type) {
+ switch (cb->compress_type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
@@ -157,7 +157,8 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
- if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
+ if ((inode->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return 0;
shash->tfm = fs_info->csum_shash;
@@ -1359,7 +1360,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = compression_decompress_bio(type, workspace, cb);
+ ret = compression_decompress_bio(workspace, cb);
put_workspace(type, workspace);
return ret;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3983bdaf4b8..a7db3f6f1b7b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -463,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -485,8 +485,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
}
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -726,21 +726,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
}
/*
- * search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart.
+ * Search for a key in the given extent_buffer.
*
- * the slot in the array is returned via slot, and it points to
- * the place where you would insert key if it is not found in
- * the array.
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
*
- * Slot may point to total number of items if the key is bigger than
- * all of the keys
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
*/
-static noinline int generic_bin_search(struct extent_buffer *eb,
- unsigned long p, int item_size,
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
const struct btrfs_key *key, int *slot)
{
- int low = 0;
+ unsigned long p;
+ int item_size;
int high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
@@ -753,6 +755,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
return -EINVAL;
}
+ if (btrfs_header_level(eb) == 0) {
+ p = offsetof(struct btrfs_leaf, items);
+ item_size = sizeof(struct btrfs_item);
+ } else {
+ p = offsetof(struct btrfs_node, ptrs);
+ item_size = sizeof(struct btrfs_key_ptr);
+ }
+
while (low < high) {
unsigned long oip;
unsigned long offset;
@@ -791,20 +801,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
}
/*
- * simple bin_search frontend that does the right thing for
- * leaves vs nodes
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot)
{
- if (btrfs_header_level(eb) == 0)
- return generic_bin_search(eb,
- offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item), key, slot);
- else
- return generic_bin_search(eb,
- offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr), key, slot);
+ return generic_bin_search(eb, 0, key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
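generic_bin_search() above follows the usual contract for a binary search that also reports the insertion point: it returns 0 on an exact match, or 1 with *slot set to where the key would go (possibly one past the last slot), starting from a caller-supplied lower slot. A plain illustrative version of that contract over an ordinary sorted array, with a hypothetical name (demo_bin_search) and not the extent buffer code:

#include <linux/types.h>

/*
 * Illustrative only: binary search over a sorted array of u64 keys,
 * starting at slot 'low'. Returns 0 on an exact match or 1 otherwise,
 * with *slot set to where the key is, or where it would be inserted
 * (which may be 'nritems' when the key is larger than every existing
 * key), mirroring the generic_bin_search() contract.
 */
static int demo_bin_search(const u64 *keys, int nritems, int low,
			   u64 key, int *slot)
{
	int high = nritems;

	while (low < high) {
		const int mid = low + (high - low) / 2;

		if (keys[mid] < key) {
			low = mid + 1;
		} else if (keys[mid] > key) {
			high = mid;
		} else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}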
@@ -927,7 +930,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -986,7 +989,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), right,
+ 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
@@ -1031,7 +1035,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
@@ -1345,33 +1349,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
{
int i;
int skip_level = level;
- int no_skips = 0;
- struct extent_buffer *t;
+ bool check_skip = true;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
if (!path->locks[i])
break;
- if (!no_skips && path->slots[i] == 0) {
- skip_level = i + 1;
- continue;
- }
- if (!no_skips && path->keep_locks) {
- u32 nritems;
- t = path->nodes[i];
- nritems = btrfs_header_nritems(t);
- if (nritems < 1 || path->slots[i] >= nritems - 1) {
+
+ if (check_skip) {
+ if (path->slots[i] == 0) {
skip_level = i + 1;
continue;
}
+
+ if (path->keep_locks) {
+ u32 nritems;
+
+ nritems = btrfs_header_nritems(path->nodes[i]);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
}
- if (skip_level < i && i >= lowest_unlock)
- no_skips = 1;
- t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level) {
- btrfs_tree_unlock_rw(t, path->locks[i]);
+ check_skip = false;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -1567,35 +1572,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
- /*
- * The commit roots are read only so we always do read locks,
- * and we always must hold the commit_root_sem when doing
- * searches on them, the only exception is send where we don't
- * want to block transaction commits for a long time, so
- * we need to clone the commit root in order to avoid races
- * with transaction commits that create a snapshot of one of
- * the roots used by a send operation.
- */
- if (p->need_commit_sem) {
- down_read(&fs_info->commit_root_sem);
- b = btrfs_clone_extent_buffer(root->commit_root);
- up_read(&fs_info->commit_root_sem);
- if (!b)
- return ERR_PTR(-ENOMEM);
-
- } else {
- b = root->commit_root;
- atomic_inc(&b->refs);
- }
+ b = root->commit_root;
+ atomic_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -1612,6 +1595,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -1638,6 +1624,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -1647,6 +1644,191 @@ out:
return b;
}
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+ const int i = path->lowest_level;
+ const int slot = path->slots[i];
+ struct extent_buffer *lowest = path->nodes[i];
+ struct extent_buffer *clone;
+
+ ASSERT(path->need_commit_sem);
+
+ if (!lowest)
+ return 0;
+
+ lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+ clone = btrfs_clone_extent_buffer(lowest);
+ if (!clone)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+ path->nodes[i] = clone;
+ path->slots[i] = slot;
+
+ return 0;
+}
+
+static inline int search_for_key_slot(struct extent_buffer *eb,
+ int search_low_slot,
+ const struct btrfs_key *key,
+ int prev_cmp,
+ int *slot)
+{
+ /*
+ * If a previous call to btrfs_bin_search() on a parent node returned an
+ * exact match (prev_cmp == 0), we can safely assume the target key will
+ * always be at slot 0 on lower levels, since each key pointer
+ * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+ * subtree it points to. Thus we can skip searching lower levels.
+ */
+ if (prev_cmp == 0) {
+ *slot = 0;
+ return 0;
+ }
+
+ return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
+static int search_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *path,
+ int ins_len,
+ int prev_cmp)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int leaf_free_space = -1;
+ int search_low_slot = 0;
+ int ret;
+ bool do_bin_search = true;
+
+ /*
+ * If we are doing an insertion, the leaf has enough free space and the
+ * destination slot for the key is not slot 0, then we can unlock our
+ * write lock on the parent, and any other upper nodes, before doing the
+ * binary search on the leaf (with search_for_key_slot()), allowing other
+ * tasks to lock the parent and any other upper nodes.
+ */
+ if (ins_len > 0) {
+ /*
+ * Cache the leaf free space, since we will need it later and it
+ * will not change until then.
+ */
+ leaf_free_space = btrfs_leaf_free_space(leaf);
+
+ /*
+ * !path->locks[1] means we have a single node tree, the leaf is
+ * the root of the tree.
+ */
+ if (path->locks[1] && leaf_free_space >= ins_len) {
+ struct btrfs_disk_key first_key;
+
+ ASSERT(btrfs_header_nritems(leaf) > 0);
+ btrfs_item_key(leaf, &first_key, 0);
+
+ /*
+ * Doing the extra comparison with the first key is cheap,
+ * taking into account that the first key is very likely
+ * already in a cache line because it immediately follows
+ * the extent buffer's header and we have recently accessed
+ * the header's level field.
+ */
+ ret = comp_keys(&first_key, key);
+ if (ret < 0) {
+ /*
+ * The first key is smaller than the key we want
+ * to insert, so we are safe to unlock all upper
+ * nodes and we have to do the binary search.
+ *
+ * We do use btrfs_unlock_up_safe() and not
+ * unlock_up() because the later does not unlock
+ * nodes with a slot of 0 - we can safely unlock
+ * any node even if its slot is 0 since in this
+ * case the key does not end up at slot 0 of the
+ * leaf and there's no need to split the leaf.
+ */
+ btrfs_unlock_up_safe(path, 1);
+ search_low_slot = 1;
+ } else {
+ /*
+ * The first key is >= the key we want to
+ * insert, so we can skip the binary search as
+ * the target key will be at slot 0.
+ *
+ * We can not unlock upper nodes when the key is
+ * less than the first key, because we will need
+ * to update the key at slot 0 of the parent node
+ * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can
+ * unlock all the upper nodes, using
+ * btrfs_unlock_up_safe() instead of unlock_up()
+ * as stated above.
+ */
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
+ /*
+ * ret is already 0 or 1, matching the result of
+ * a btrfs_bin_search() call, so there is no need
+ * to adjust it.
+ */
+ do_bin_search = false;
+ path->slots[0] = 0;
+ }
+ }
+ }
+
+ if (do_bin_search) {
+ ret = search_for_key_slot(leaf, search_low_slot, key,
+ prev_cmp, &path->slots[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (ins_len > 0) {
+ /*
+ * Item key already exists. In this case, if we are allowed to
+ * insert the item (for example, in dir_item case, item key
+ * collision is allowed), it will be merged with the original
+ * item. Only the item size grows, no new btrfs item will be
+ * added. If search_for_extension is not set, ins_len already
+ * accounts for the size of a btrfs_item; deduct it here so the
+ * leaf space check will be correct.
+ */
+ if (ret == 0 && !path->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
+
+ ASSERT(leaf_free_space >= 0);
+
+ if (leaf_free_space < ins_len) {
+ int err;
+
+ err = split_leaf(trans, root, key, path, ins_len,
+ (ret == 0));
+ ASSERT(err <= 0);
+ if (WARN_ON(err > 0))
+ err = -EUCLEAN;
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
@@ -1683,6 +1865,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -1724,6 +1907,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
min_write_lock_level = write_lock_level;
+ if (p->need_commit_sem) {
+ ASSERT(p->search_commit_root);
+ down_read(&fs_info->commit_root_sem);
+ }
+
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
@@ -1777,10 +1965,6 @@ again:
}
cow_done:
p->nodes[level] = b;
- /*
- * Leave path with blocking locks to avoid massive
- * lock context switch, this is made on purpose.
- */
/*
* we have a lock on b and as long as we aren't changing
@@ -1802,62 +1986,22 @@ cow_done:
}
}
- /*
- * If btrfs_bin_search returns an exact match (prev_cmp == 0)
- * we can safely assume the target key will always be in slot 0
- * on lower levels due to the invariants BTRFS' btree provides,
- * namely that a btrfs_key_ptr entry always points to the
- * lowest key in the child node, thus we can skip searching
- * lower levels
- */
- if (prev_cmp == 0) {
- slot = 0;
- ret = 0;
- } else {
- ret = btrfs_bin_search(b, key, &slot);
- prev_cmp = ret;
- if (ret < 0)
- goto done;
- }
-
if (level == 0) {
- p->slots[level] = slot;
- /*
- * Item key already exists. In this case, if we are
- * allowed to insert the item (for example, in dir_item
- * case, item key collision is allowed), it will be
- * merged with the original item. Only the item size
- * grows, no new btrfs item will be added. If
- * search_for_extension is not set, ins_len already
- * accounts the size btrfs_item, deduct it here so leaf
- * space check will be correct.
- */
- if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
- ASSERT(ins_len >= sizeof(struct btrfs_item));
- ins_len -= sizeof(struct btrfs_item);
- }
- if (ins_len > 0 &&
- btrfs_leaf_free_space(b) < ins_len) {
- if (write_lock_level < 1) {
- write_lock_level = 1;
- btrfs_release_path(p);
- goto again;
- }
+ if (ins_len > 0)
+ ASSERT(write_lock_level >= 1);
- err = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
-
- BUG_ON(err > 0);
- if (err) {
- ret = err;
- goto done;
- }
- }
+ ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL);
goto done;
}
+
+ ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
+ prev_cmp = ret;
+
if (ret && slot > 0) {
dec = 1;
slot--;
@@ -1918,6 +2062,16 @@ cow_done:
done:
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
+
+ if (p->need_commit_sem) {
+ int ret2;
+
+ ret2 = finish_need_commit_sem_search(p);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
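A minimal sketch of how a caller is expected to use the new need_commit_sem path flag (root, key and the error handling are placeholders; as the hunk above shows, btrfs_search_slot() takes and releases fs_info->commit_root_sem itself):

	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* Read-only search against the commit root, no extent buffer locks. */
	path->search_commit_root = 1;
	path->skip_locking = 1;
	/* Let btrfs_search_slot() take commit_root_sem around the search. */
	path->need_commit_sem = 1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* ... process path->nodes[0] ... */
out:
	btrfs_free_path(path);
	return ret;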
@@ -2615,19 +2769,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
- struct btrfs_item *start_item;
- struct btrfs_item *end_item;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- start_item = btrfs_item_nr(start);
- end_item = btrfs_item_nr(end);
- data_len = btrfs_item_offset(l, start_item) +
- btrfs_item_size(l, start_item);
- data_len = data_len - btrfs_item_offset(l, end_item);
+ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+ data_len = data_len - btrfs_item_offset(l, end);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -2674,7 +2823,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
u32 i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 nr;
u32 right_nritems;
u32 data_end;
@@ -2691,8 +2839,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] > i)
break;
@@ -2707,12 +2853,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(left, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(left, i);
+ if (this_item_size + sizeof(struct btrfs_item) +
+ push_space > free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
if (i == 0)
break;
i--;
@@ -2726,7 +2873,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* push left to right */
right_nritems = btrfs_header_nritems(right);
- push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space = btrfs_item_data_end(left, left_nritems - push_items);
push_space -= leaf_data_end(left);
/* make room in the right data area */
@@ -2757,9 +2904,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(&token, item);
- btrfs_set_token_item_offset(&token, item, push_space);
+ push_space -= btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
left_nritems -= push_items;
@@ -2904,7 +3050,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 old_left_nritems;
u32 nr;
int ret = 0;
@@ -2918,8 +3063,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] < i)
break;
@@ -2934,12 +3077,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(right, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(right, i);
+ if (this_item_size + sizeof(struct btrfs_item) + push_space >
+ free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
}
if (push_items == 0) {
@@ -2955,25 +3099,23 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
push_items * sizeof(struct btrfs_item));
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
- btrfs_item_offset_nr(right, push_items - 1);
+ btrfs_item_offset(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset_nr(right, push_items - 1),
+ btrfs_item_offset(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
btrfs_init_map_token(&token, left);
- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
-
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item,
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -2984,7 +3126,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
right_nritems);
if (push_items < right_nritems) {
- push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
@@ -3002,10 +3144,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
-
- push_space = push_space - btrfs_token_item_size(&token, item);
- btrfs_set_token_item_offset(&token, item, push_space);
+ push_space = push_space - btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -3133,7 +3273,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
+ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
@@ -3144,15 +3284,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(l), data_copy_size);
- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -3268,7 +3407,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ if (extend && data_size + btrfs_item_size(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
@@ -3437,7 +3576,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -3457,7 +3596,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
leaf = path->nodes[0];
/* if our item isn't there, return now */
- if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ if (item_size != btrfs_item_size(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
@@ -3488,9 +3627,7 @@ static noinline int split_item(struct btrfs_path *path,
unsigned long split_offset)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
- struct btrfs_item *new_item;
- int slot;
+ int orig_slot, slot;
char *buf;
u32 nritems;
u32 item_size;
@@ -3500,9 +3637,9 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- item = btrfs_item_nr(path->slots[0]);
- orig_offset = btrfs_item_offset(leaf, item);
- item_size = btrfs_item_size(leaf, item);
+ orig_slot = path->slots[0];
+ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
@@ -3523,14 +3660,12 @@ static noinline int split_item(struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(slot);
+ btrfs_set_item_offset(leaf, slot, orig_offset);
+ btrfs_set_item_size(leaf, slot, item_size - split_offset);
- btrfs_set_item_offset(leaf, new_item, orig_offset);
- btrfs_set_item_size(leaf, new_item, item_size - split_offset);
-
- btrfs_set_item_offset(leaf, item,
- orig_offset + item_size - split_offset);
- btrfs_set_item_size(leaf, item, split_offset);
+ btrfs_set_item_offset(leaf, orig_slot,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, orig_slot, split_offset);
btrfs_set_header_nritems(leaf, nritems + 1);
@@ -3591,7 +3726,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data_start;
@@ -3603,14 +3737,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
leaf = path->nodes[0];
slot = path->slots[0];
- old_size = btrfs_item_size_nr(leaf, slot);
+ old_size = btrfs_item_size(leaf, slot);
if (old_size == new_size)
return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- old_data_start = btrfs_item_offset_nr(leaf, slot);
+ old_data_start = btrfs_item_offset(leaf, slot);
size_diff = old_size - new_size;
@@ -3624,10 +3758,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + size_diff);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
}
/* shift the data */
@@ -3670,8 +3803,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
fixup_low_keys(path, &disk_key, 1);
}
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, new_size);
+ btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3687,7 +3819,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data;
@@ -3705,7 +3836,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
BUG();
}
slot = path->slots[0];
- old_data = btrfs_item_end_nr(leaf, slot);
+ old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
if (slot >= nritems) {
@@ -3722,10 +3853,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff - data_size);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff - data_size);
}
/* shift the data */
@@ -3734,9 +3864,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end, old_data - data_end);
data_end = old_data;
- old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, old_size + data_size);
+ old_size = btrfs_item_size(leaf, slot);
+ btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3758,7 +3887,6 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
@@ -3795,7 +3923,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+ unsigned int old_data = btrfs_item_data_end(leaf, slot);
if (old_data < data_end) {
btrfs_print_leaf(leaf);
@@ -3811,10 +3939,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item,
- ioff - batch->total_data_size);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - batch->total_data_size);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
@@ -3833,10 +3960,9 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
for (i = 0; i < batch->nr; i++) {
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(slot + i);
data_end -= batch->data_sizes[i];
- btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
+ btrfs_set_token_item_offset(&token, slot + i, data_end);
+ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
@@ -3943,7 +4069,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
u32 item_size;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ret = setup_leaf_for_split(trans, root, path,
item_size + sizeof(struct btrfs_item));
if (ret)
@@ -4032,7 +4158,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4044,7 +4170,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 last_off;
u32 dsize = 0;
int ret = 0;
@@ -4053,10 +4178,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+ last_off = btrfs_item_offset(leaf, slot + nr - 1);
for (i = 0; i < nr; i++)
- dsize += btrfs_item_size_nr(leaf, slot + i);
+ dsize += btrfs_item_size(leaf, slot + i);
nritems = btrfs_header_nritems(leaf);
@@ -4073,9 +4198,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(&token, item);
- btrfs_set_token_item_offset(&token, item, ioff + dsize);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -4402,7 +4526,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int level;
struct extent_buffer *c;
struct extent_buffer *next;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
+ bool need_commit_sem = false;
u32 nritems;
int ret;
int i;
@@ -4419,14 +4545,20 @@ again:
path->keep_locks = 1;
- if (time_seq)
+ if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
- else
+ } else {
+ if (path->need_commit_sem) {
+ path->need_commit_sem = 0;
+ need_commit_sem = true;
+ down_read(&fs_info->commit_root_sem);
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ }
path->keep_locks = 0;
if (ret < 0)
- return ret;
+ goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
@@ -4549,6 +4681,15 @@ again:
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
+ if (need_commit_sem) {
+ int ret2;
+
+ path->need_commit_sem = 1;
+ ret2 = finish_need_commit_sem_search(path);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7553e9dc5f93..b4a9b1c58d22 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -143,6 +143,8 @@ enum {
BTRFS_FS_STATE_DEV_REPLACING,
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
};
#define BTRFS_BACKREF_REV_MAX 256
@@ -511,11 +513,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
@@ -553,7 +550,6 @@ struct btrfs_swapfile_pin {
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
enum {
- BTRFS_FS_BARRIER,
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
BTRFS_FS_LOG_RECOVERING,
@@ -576,7 +572,6 @@ enum {
/*
* Indicate that relocation of a chunk has started, it's set per chunk
* and is toggled between chunks.
- * Set, tested and cleared while holding fs_info::send_reloc_lock.
*/
BTRFS_FS_RELOC_RUNNING,
@@ -601,6 +596,9 @@ enum {
/* Indicate whether there are any tree modification log users */
BTRFS_FS_TREE_MOD_LOG_USERS,
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -613,6 +611,7 @@ enum {
*/
enum btrfs_exclusive_operation {
BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
BTRFS_EXCLOP_BALANCE,
BTRFS_EXCLOP_DEV_ADD,
BTRFS_EXCLOP_DEV_REMOVE,
@@ -624,20 +623,21 @@ enum btrfs_exclusive_operation {
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
- struct btrfs_root *extent_root;
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
- struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
- struct btrfs_root *free_space_root;
struct btrfs_root *data_reloc_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
@@ -673,6 +673,12 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
u64 avg_delayed_ref_runtime;
/*
@@ -815,7 +821,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
- struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@@ -952,13 +957,6 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* readahead tree */
- spinlock_t reada_lock;
- struct radix_tree_root reada_tree;
-
- /* readahead works cnt */
- atomic_t reada_works_cnt;
-
/* Extent buffer radix tree */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
@@ -1003,13 +1001,6 @@ struct btrfs_fs_info {
struct crypto_shash *csum_shash;
- spinlock_t send_reloc_lock;
- /*
- * Number of send operations in progress.
- * Updated while holding fs_info::send_reloc_lock.
- */
- int send_in_progress;
-
/* Type of exclusive operation running, protected by super_lock */
enum btrfs_exclusive_operation exclusive_operation;
@@ -1110,6 +1101,8 @@ enum {
BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING,
+ /* We started the orphan cleanup for this root. */
+ BTRFS_ROOT_ORPHAN_CLEANUP,
};
/*
@@ -1128,6 +1121,8 @@ struct btrfs_qgroup_swapped_blocks {
* and for the extent tree extent_root root.
*/
struct btrfs_root {
+ struct rb_node rb_node;
+
struct extent_buffer *node;
struct extent_buffer *commit_root;
@@ -1178,8 +1173,6 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
- int orphan_cleanup_state;
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
@@ -1960,8 +1953,8 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb,
}
/* struct btrfs_item */
-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
@@ -1976,25 +1969,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
-static inline u32 btrfs_item_end(const struct extent_buffer *eb,
- struct btrfs_item *item)
-{
- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
+ int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
}
-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_end(eb, btrfs_item_nr(nr));
-}
-
-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_offset(eb, btrfs_item_nr(nr));
-}
+BTRFS_ITEM_SETGET_FUNCS(offset);
+BTRFS_ITEM_SETGET_FUNCS(size);
-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
{
- return btrfs_item_size(eb, btrfs_item_nr(nr));
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
}
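The net effect of the accessor rework: callers index items by leaf slot and never touch struct btrfs_item directly. A short sketch, assuming leaf and slot are in scope:

	/* btrfs_item_nr() is now an internal detail of these wrappers. */
	u32 size = btrfs_item_size(leaf, slot);          /* was btrfs_item_size_nr() */
	u32 offset = btrfs_item_offset(leaf, slot);      /* was btrfs_item_offset_nr() */
	u32 data_end = btrfs_item_data_end(leaf, slot);  /* was btrfs_item_end_nr() */

	/* For a valid item the data end is simply offset + size. */
	ASSERT(data_end == offset + size);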
static inline void btrfs_item_key(const struct extent_buffer *eb,
@@ -2257,6 +2261,11 @@ static inline bool btrfs_root_dead(const struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2458,7 +2467,7 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
if (nr == 0)
return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
- return btrfs_item_offset_nr(leaf, nr - 1);
+ return btrfs_item_offset(leaf, nr - 1);
}
/* struct btrfs_file_extent_item */
@@ -2517,9 +2526,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
*/
static inline u32 btrfs_file_extent_inline_item_len(
const struct extent_buffer *eb,
- struct btrfs_item *e)
+ int nr)
{
- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* btrfs_qgroup_status_item */
@@ -2611,11 +2620,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
#define btrfs_item_ptr_offset(leaf, slot) \
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
@@ -2719,7 +2728,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 empty_size,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -3114,36 +3123,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-item.c */
-int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 index);
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index);
-int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path,
- struct btrfs_key *location, int mod);
-
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
-
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len);
-struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len);
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
@@ -3203,10 +3182,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int front);
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode, u64 new_size,
- u32 min_type, u64 *extents_found);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -3305,6 +3280,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3821,23 +3799,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
btrfs_bio_counter_sub(fs_info, 1);
}
-/* reada.c */
-struct reada_control {
- struct btrfs_fs_info *fs_info; /* tree to prefetch */
- struct btrfs_key key_start;
- struct btrfs_key key_end; /* exclusive */
- atomic_t elems;
- struct kref refcnt;
- wait_queue_head_t wait;
-};
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *start, struct btrfs_key *end);
-int btrfs_reada_wait(void *handle);
-void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct extent_buffer *eb, int err);
-void btrfs_reada_remove_dev(struct btrfs_device *dev);
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2059d1504149..fb46a28f5065 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -143,10 +143,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
- if (ret < 0)
+ if (ret < 0) {
btrfs_free_reserved_data_space_noquota(fs_info, len);
- else
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ } else {
ret = 0;
+ }
return ret;
}
@@ -331,7 +334,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
@@ -452,8 +455,11 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len);
- if (ret < 0)
+ if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ }
return ret;
}
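With these changes the reservation helpers own the changeset on failure: they free it and reset the caller's pointer to NULL, so callers must not free it themselves on error. A minimal caller sketch (inode, start and len are placeholders):

	struct extent_changeset *data_reserved = NULL;
	int ret;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;	/* *reserved was freed and reset to NULL */

	/*
	 * ... dirty the range; on success the reservation is consumed (or
	 * explicitly released) by the delalloc machinery later on ...
	 */

	extent_changeset_free(data_reserved);
	return 0;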
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e164766dcc38..748bf6b0d860 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -13,6 +13,7 @@
#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
+#include "inode-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -629,7 +630,7 @@ static int btrfs_delayed_inode_reserve_metadata(
BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
/* NO_FLUSH could only fail with -ENOSPC */
ASSERT(ret == 0 || ret == -ENOSPC);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cca7e85e32dd..4176df149d04 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -84,6 +84,17 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -108,6 +119,17 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
num_bytes = btrfs_calc_insert_metadata_size(fs_info,
trans->delayed_ref_updates);
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
delayed_rsv->full = 0;
@@ -191,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (ret)
return ret;
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
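A rough worked example of the doubling, assuming a 16K nodesize and the usual definition of btrfs_calc_insert_metadata_size() (nodesize * BTRFS_MAX_LEVEL * 2 per item, with BTRFS_MAX_LEVEL being 8):

	/* One delayed ref head: 16K * 8 * 2 = 256K of metadata reservation. */
	u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

	/* With the free space tree also needing updates: 512K. */
	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
		num_bytes *= 2;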
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c85a7d44da79..62b9651ea662 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -128,7 +128,7 @@ no_valid_dev_replace_entry_found:
}
slot = path->slots[0];
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
@@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error;
@@ -381,7 +381,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/*
* need to delete old one and insert a new one.
* Since no attempt is made to recover any old state, if the
@@ -906,9 +906,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- if (!scrub_ret)
- btrfs_reada_remove_dev(src_device);
-
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
@@ -917,7 +914,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- btrfs_reada_undo_remove_dev(src_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
@@ -968,7 +964,6 @@ error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_reada_undo_remove_dev(src_device);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7721ce0c0604..3b532bab0755 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
- struct btrfs_item *item;
struct extent_buffer *leaf;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
- BUG_ON(data_size > btrfs_item_size(leaf, item));
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
+ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
return (struct btrfs_dir_item *)ptr;
}
@@ -271,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
data_size = sizeof(*di) + name_len;
leaf = path->nodes[0];
slot = path->slots[0];
- if (data_size + btrfs_item_size_nr(leaf, slot) +
+ if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
ret = -EOVERFLOW;
} else {
@@ -409,7 +407,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ total_len = btrfs_item_size(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
btrfs_dir_name_len(leaf, dir_item) +
@@ -445,7 +443,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di);
- item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_len = btrfs_item_size(leaf, path->slots[0]);
if (sub_item_len == item_len) {
ret = btrfs_del_item(trans, root, path);
} else {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59c3be8c1f4c..87a5addbedf6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -665,9 +665,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
if (ret < 0)
goto err;
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
set_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
@@ -715,10 +712,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
}
ret = validate_extent_buffer(eb);
err:
- if (reads_done &&
- test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
if (ret) {
/*
* our io error hook is going to dec the io pages
@@ -1140,11 +1133,16 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
root->fs_info = fs_info;
+ root->root_key.objectid = objectid;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
- root->orphan_cleanup_state = 0;
+ RB_CLEAR_NODE(&root->rb_node);
root->last_trans = 0;
root->free_objectid = 0;
@@ -1152,7 +1150,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
- root->block_rsv = NULL;
+
+ btrfs_init_root_block_rsv(root);
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
@@ -1190,6 +1189,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
+ root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
@@ -1197,12 +1197,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
IO_TREE_LOG_CSUM_RANGE, NULL);
}
- memset(&root->root_key, 0, sizeof(root->root_key));
- memset(&root->root_item, 0, sizeof(root->root_item));
- memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- root->root_key.objectid = objectid;
- root->anon_dev = 0;
-
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
@@ -1242,6 +1236,81 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
}
#endif
+static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
+{
+ const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
+ const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
+}
+
+static int global_root_key_cmp(const void *k, const struct rb_node *node)
+{
+ const struct btrfs_key *key = k;
+ const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(key, &root->root_key);
+}
+
+int btrfs_global_root_insert(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *tmp;
+
+ write_lock(&fs_info->global_root_lock);
+ tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
+ write_unlock(&fs_info->global_root_lock);
+ ASSERT(!tmp);
+
+ return tmp ? -EEXIST : 0;
+}
+
+void btrfs_global_root_delete(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ write_lock(&fs_info->global_root_lock);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ write_unlock(&fs_info->global_root_lock);
+}
+
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key)
+{
+ struct rb_node *node;
+ struct btrfs_root *root = NULL;
+
+ read_lock(&fs_info->global_root_lock);
+ node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
+ if (node)
+ root = container_of(node, struct btrfs_root, rb_node);
+ read_unlock(&fs_info->global_root_lock);
+
+ return root;
+}
+
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_CSUM_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_EXTENT_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
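A small usage sketch for the new global root tree (fs_info and bytenr are assumed to be in scope; as the helpers above show, the bytenr argument is currently ignored and the lookup key always uses offset 0):

	struct btrfs_key key = {
		.objectid = BTRFS_CSUM_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = 0,
	};
	struct btrfs_root *csum_root;

	/* Direct lookup in the rb-tree... */
	csum_root = btrfs_global_root(fs_info, &key);
	/* ...or through the convenience helper, which builds the same key. */
	ASSERT(csum_root == btrfs_csum_root(fs_info, bytenr));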
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1554,25 +1623,33 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
if (objectid == BTRFS_ROOT_TREE_OBJECTID)
return btrfs_grab_root(fs_info->tree_root);
if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->extent_root);
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
return btrfs_grab_root(fs_info->chunk_root);
if (objectid == BTRFS_DEV_TREE_OBJECTID)
return btrfs_grab_root(fs_info->dev_root);
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->csum_root);
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
return btrfs_grab_root(fs_info->quota_root) ?
fs_info->quota_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
- if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return btrfs_grab_root(fs_info->free_space_root) ?
- fs_info->free_space_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ struct btrfs_root *root = btrfs_global_root(fs_info, &key);
+
+ return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
+ }
return NULL;
}
@@ -1619,6 +1696,18 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
#endif
}
+static void free_global_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct rb_node *node;
+
+ while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
+ root = rb_entry(node, struct btrfs_root, rb_node);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ btrfs_put_root(root);
+ }
+}
+
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
@@ -1630,14 +1719,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_free_ref_cache(fs_info);
kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
- btrfs_put_root(fs_info->extent_root);
+ free_global_roots(fs_info);
btrfs_put_root(fs_info->tree_root);
btrfs_put_root(fs_info->chunk_root);
btrfs_put_root(fs_info->dev_root);
- btrfs_put_root(fs_info->csum_root);
btrfs_put_root(fs_info->quota_root);
btrfs_put_root(fs_info->uuid_root);
- btrfs_put_root(fs_info->free_space_root);
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_check_leaked_roots(fs_info);
@@ -1732,6 +1819,14 @@ again:
}
return root;
fail:
+ /*
+ * If our caller provided us an anonymous device, then it's the caller's
+ * responsibility to free it in case we fail. So we have to set our
+ * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+ * and once again by our caller.
+ */
+ if (anon_dev)
+ root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1927,7 +2022,8 @@ static int transaction_kthread(void *arg)
}
delta = ktime_get_seconds() - cur->start_time;
- if (cur->state < TRANS_STATE_COMMIT_START &&
+ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
+ cur->state < TRANS_STATE_COMMIT_START &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
delay -= msecs_to_jiffies((delta - 1) * 1000);
@@ -1999,6 +2095,8 @@ static void backup_super_roots(struct btrfs_fs_info *info)
{
const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
root_backup = info->super_for_commit->super_roots + next_backup;
@@ -2023,11 +2121,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root(root_backup, extent_root->node->start);
btrfs_set_backup_extent_root_gen(root_backup,
- btrfs_header_generation(info->extent_root->node));
+ btrfs_header_generation(extent_root->node));
btrfs_set_backup_extent_root_level(root_backup,
- btrfs_header_level(info->extent_root->node));
+ btrfs_header_level(extent_root->node));
/*
* we might commit during log recovery, which happens before we set
@@ -2048,11 +2146,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
- btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
btrfs_set_backup_csum_root_gen(root_backup,
- btrfs_header_generation(info->csum_root->node));
+ btrfs_header_generation(csum_root->node));
btrfs_set_backup_csum_root_level(root_backup,
- btrfs_header_level(info->csum_root->node));
+ btrfs_header_level(csum_root->node));
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
@@ -2127,7 +2225,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
- btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
if (fs_info->discard_ctl.discard_workers)
@@ -2151,21 +2248,29 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
}
+static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(root, tmp,
+ &fs_info->global_root_tree,
+ rb_node)
+ free_root_extent_buffers(root);
+}
+
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
+ free_global_root_pointers(info);
free_root_extent_buffers(info->dev_root);
- free_root_extent_buffers(info->extent_root);
- free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
- free_root_extent_buffers(info->free_space_root);
}
void btrfs_put_root(struct btrfs_root *root)
@@ -2283,8 +2388,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->qgroup_rescan_lock);
}
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices)
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
@@ -2333,9 +2437,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->delayed_workers =
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
max_active, 0);
- fs_info->readahead_workers =
- btrfs_alloc_workqueue(fs_info, "readahead", flags,
- max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->discard_ctl.discard_workers =
@@ -2347,9 +2448,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_meta_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->readahead_workers &&
- fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->qgroup_rescan_workers &&
+ fs_info->caching_workers && fs_info->fixup_workers &&
+ fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@@ -2427,6 +2527,104 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return 0;
}
+static int load_global_roots_objectid(struct btrfs_root *tree_root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name)
+{
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *root;
+ int ret;
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ bool found = false;
+
+ /* If we have IGNOREDATACSUMS skip loading these roots. */
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
+ btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ return 0;
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+ ret = 0;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != objectid)
+ break;
+ btrfs_release_path(path);
+
+ found = true;
+ root = read_tree_root_path(tree_root, path, &key);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = PTR_ERR(root);
+ break;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ ret = btrfs_global_root_insert(root);
+ if (ret) {
+ btrfs_put_root(root);
+ break;
+ }
+ key.offset++;
+ }
+ btrfs_release_path(path);
+
+ if (!found || ret) {
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = ret ? ret : -ENOENT;
+ else
+ ret = 0;
+ btrfs_err(fs_info, "failed to load root %s", name);
+ }
+ return ret;
+}
+
+static int load_global_roots(struct btrfs_root *tree_root)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_EXTENT_TREE_OBJECTID, "extent");
+ if (ret)
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_CSUM_TREE_OBJECTID, "csum");
+ if (ret)
+ goto out;
+ if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_FREE_SPACE_TREE_OBJECTID,
+ "free space");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -2436,7 +2634,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
BUG_ON(!fs_info->tree_root);
- location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ ret = load_global_roots(tree_root);
+ if (ret)
+ return ret;
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
@@ -2448,38 +2650,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
- }
-
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
btrfs_init_devices_late(fs_info);
- /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
- if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
- }
- }
-
/*
* This tree can share blocks with some other fs tree during relocation
* and we need a proper setup by btrfs_get_fs_root
@@ -2517,20 +2692,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
- }
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
- }
- }
-
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2850,6 +3011,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
/* All successful */
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
+ fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
if (backup_index < 0) {
@@ -2885,6 +3047,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->zone_active_bgs_lock);
spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
+ rwlock_init(&fs_info->global_root_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
@@ -2916,9 +3079,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
- atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->global_root_tree = RB_ROOT;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
@@ -2926,9 +3089,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
- /* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- spin_lock_init(&fs_info->reada_lock);
btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2950,7 +3110,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
- set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -2985,9 +3144,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
- spin_lock_init(&fs_info->send_reloc_lock);
- fs_info->send_in_progress = 0;
-
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
@@ -3415,7 +3571,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->subpage_info = subpage_info;
}
- ret = btrfs_init_workqueues(fs_info, fs_devices);
+ ret = btrfs_init_workqueues(fs_info);
if (ret) {
err = ret;
goto fail_sb_buffer;
@@ -3563,6 +3719,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
+ btrfs_free_zone_cache(fs_info);
+
if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
@@ -3978,11 +4136,23 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio = device->flush_bio;
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However, when the integrity checker is enabled it
+ * reports metadata blocks referred to by a superblock that were not
+ * properly flushed. So, for simplicity, don't skip the bio submission
+ * when the integrity checker is enabled: it is a debug tool and not
+ * meant for use in non-debug builds.
+ */
+ struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
+#endif
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
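
Note: for builds without the integrity checker, the guard above boils down to "does this device advertise a volatile write cache?". A minimal hedged sketch of that check; the helper name device_needs_flush is made up for illustration:

/*
 * Illustrative only: true if the device advertises a volatile write
 * cache and therefore needs an explicit flush before the superblock.
 */
static bool device_needs_flush(struct btrfs_device *device)
{
	struct request_queue *q = bdev_get_queue(device->bdev);

	return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
}
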
@@ -4313,6 +4483,48 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
return btrfs_commit_transaction(trans);
}
+static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_transaction *trans;
+ struct btrfs_transaction *tmp;
+ bool found = false;
+
+ if (list_empty(&fs_info->trans_list))
+ return;
+
+ /*
+ * This function is only called at the very end of close_ctree(), so
+ * there is no other running transaction and no need to take trans_lock.
+ */
+ ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
+ list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
+ struct extent_state *cached = NULL;
+ u64 dirty_bytes = 0;
+ u64 cur = 0;
+ u64 found_start;
+ u64 found_end;
+
+ found = true;
+ while (!find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ dirty_bytes += found_end + 1 - found_start;
+ cur = found_end + 1;
+ }
+ btrfs_warn(fs_info,
+ "transaction %llu (with %llu dirty metadata bytes) is not committed",
+ trans->transid, dirty_bytes);
+ btrfs_cleanup_one_transaction(trans, fs_info);
+
+ if (trans == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ list_del_init(&trans->list);
+
+ btrfs_put_transaction(trans);
+ trace_btrfs_transaction_commit(fs_info);
+ }
+ ASSERT(!found);
+}
+
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -4421,7 +4633,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_stop_all_workers(fs_info);
/* We shouldn't have any transaction open at this point */
- ASSERT(list_empty(&fs_info->trans_list));
+ warn_about_uncommitted_trans(fs_info);
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
@@ -4969,7 +5181,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(t);
- trace_btrfs_transaction_commit(fs_info->tree_root);
+ trace_btrfs_transaction_commit(fs_info);
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a2b5db4ba262..5e8bef4b7563 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,12 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
+int btrfs_global_root_insert(struct btrfs_root *root);
+void btrfs_global_root_delete(struct btrfs_root *root);
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key);
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@@ -103,6 +109,11 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
return NULL;
}
+static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_extent_root(fs_info, 0);
+}
+
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
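
Note: with the single fs_info->extent_root/csum_root pointers going away, callers now resolve the global root that covers a given logical address. A minimal sketch of the calling pattern used throughout this series, mirroring the converted call sites such as cleanup_ref_head(); the wrapper name drop_csums_for_extent is hypothetical and error handling is elided:

/* Illustrative only: delete csums for a freed data extent. */
static int drop_csums_for_extent(struct btrfs_trans_handle *trans,
				 u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *csum_root;

	/* Pick the csum root responsible for this logical address. */
	csum_root = btrfs_csum_root(trans->fs_info, bytenr);

	return btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
}
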
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3fd736a02c1e..d89273c4b6b8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -87,6 +87,7 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, start);
int ret;
struct btrfs_key key;
struct btrfs_path *path;
@@ -98,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
key.objectid = start;
key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
btrfs_free_path(path);
return ret;
}
@@ -116,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags)
{
+ struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
@@ -153,7 +155,8 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ extent_root = btrfs_extent_root(fs_info, bytenr);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
@@ -171,7 +174,7 @@ search_again:
if (ret == 0) {
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (item_size >= sizeof(*ei)) {
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
@@ -443,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
u64 root_objectid,
u64 owner, u64 offset)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
@@ -519,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner,
u64 offset, int refs_to_add)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
u32 size;
@@ -593,6 +596,7 @@ fail:
}
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
int refs_to_drop, int *last_ref)
{
@@ -626,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
num_refs -= refs_to_drop;
if (num_refs == 0) {
- ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
+ ret = btrfs_del_item(trans, root, path);
*last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
@@ -685,7 +689,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 parent,
u64 root_objectid)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
@@ -709,6 +713,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 parent,
u64 root_objectid)
{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
@@ -721,8 +726,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
key.offset = root_objectid;
}
- ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
- path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
btrfs_release_path(path);
return ret;
}
@@ -787,7 +791,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 owner, u64 offset, int insert)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -865,7 +869,7 @@ again:
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
btrfs_print_v0_err(fs_info);
@@ -1007,7 +1011,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
__run_delayed_extent_op(extent_op, leaf, ei);
ptr = (unsigned long)ei + item_offset;
- end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
if (ptr < end - size)
memmove_extent_buffer(leaf, ptr + size, ptr,
end - size - ptr);
@@ -1119,7 +1123,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
} else {
*last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = (unsigned long)iref;
end = (unsigned long)ei + item_size;
if (ptr + size < end)
@@ -1174,6 +1178,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
}
static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_drop, int is_data, int *last_ref)
@@ -1185,11 +1190,11 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
last_ref);
} else if (is_data) {
- ret = remove_extent_data_ref(trans, path, refs_to_drop,
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
last_ref);
} else {
*last_ref = 1;
- ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
+ ret = btrfs_del_item(trans, root, path);
}
return ret;
}
@@ -1572,6 +1577,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -1601,8 +1607,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
key.offset = head->num_bytes;
}
+ root = btrfs_extent_root(fs_info, key.objectid);
again:
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
goto out;
@@ -1634,7 +1641,7 @@ again:
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
@@ -1844,8 +1851,11 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved) {
btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info->csum_root,
- head->bytenr, head->num_bytes);
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(fs_info, head->bytenr);
+ ret = btrfs_del_csums(trans, csum_root, head->bytenr,
+ head->num_bytes);
}
}
@@ -2285,7 +2295,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct extent_buffer *leaf;
struct btrfs_extent_data_ref *ref;
struct btrfs_extent_inline_ref *iref;
@@ -2316,7 +2326,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
goto out;
ret = 1;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
/* If extent item has more than 1 inline ref then it's shared */
@@ -2920,7 +2930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_root *extent_root = info->extent_root;
+ struct btrfs_root *extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -2936,6 +2946,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int last_ref = 0;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ extent_root = btrfs_extent_root(info, bytenr);
+ ASSERT(extent_root);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2996,9 +3009,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto err_dump;
}
/* Must be SHARED_* item, remove the backref first */
- ret = remove_extent_backref(trans, path, NULL,
- refs_to_drop,
- is_data, &last_ref);
+ ret = remove_extent_backref(trans, extent_root, path,
+ NULL, refs_to_drop, is_data,
+ &last_ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3068,7 +3081,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, extent_slot);
+ item_size = btrfs_item_size(leaf, extent_slot);
if (unlikely(item_size < sizeof(*ei))) {
ret = -EINVAL;
btrfs_print_v0_err(info);
@@ -3122,8 +3135,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
}
if (found_extent) {
- ret = remove_extent_backref(trans, path, iref,
- refs_to_drop, is_data,
+ ret = remove_extent_backref(trans, extent_root, path,
+ iref, refs_to_drop, is_data,
&last_ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3179,7 +3192,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (is_data) {
- ret = btrfs_del_csums(trans, info->csum_root, bytenr,
+ struct btrfs_root *csum_root;
+ csum_root = btrfs_csum_root(info, bytenr);
+ ret = btrfs_del_csums(trans, csum_root, bytenr,
num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3275,20 +3290,20 @@ out_delayed_unlock:
}
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_ref generic_ref = { 0 };
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid, 0, false);
+ root_id, 0, false);
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
@@ -3298,7 +3313,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache;
bool must_pin = false;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
if (!ret) {
btrfs_redirty_list_add(trans->transaction, buf);
@@ -3790,23 +3805,35 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
spin_unlock(&fs_info->relocation_bg_lock);
if (skip)
return 1;
+
/* Check RO and no space case before trying to activate it */
spin_lock(&block_group->lock);
if (block_group->ro ||
block_group->alloc_offset == block_group->zone_capacity) {
- spin_unlock(&block_group->lock);
- return 1;
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
}
spin_unlock(&block_group->lock);
- if (!btrfs_zone_activate(block_group))
- return 1;
+ if (!ret && !btrfs_zone_activate(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
spin_lock(&fs_info->relocation_bg_lock);
+ if (ret)
+ goto out;
+
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
@@ -3947,6 +3974,28 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
}
}
+static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return true;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /*
+ * If we have enough free space left in an already
+ * active block group and we can't activate any other
+ * zone now, do not allow allocating a new chunk and
+ * let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return false;
+ return true;
+ default:
+ BUG();
+ }
+}
+
static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
@@ -3975,7 +4024,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
bool full_search)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->chunk_root;
int ret;
if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
@@ -3987,18 +4036,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return 0;
}
- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
- /*
- * If we have enough free space left in an already active block
- * group and we can't activate any other zone now, retry the
- * active ones with a smaller allocation size. Returning early
- * from here will tell btrfs_reserve_extent() to haven the
- * size.
- */
- return -ENOSPC;
- }
-
if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
return 1;
@@ -4034,6 +4071,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
int exist = 0;
+ /* Check if the allocation policy allows us to create a new chunk */
+ if (!can_allocate_chunk(fs_info, ffe_ctl))
+ return -ENOSPC;
+
trans = current->journal_info;
if (trans)
exist = 1;
@@ -4570,6 +4611,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins, int ref_mod)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_inline_ref *iref;
@@ -4589,8 +4631,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- ins, size);
+ extent_root = btrfs_extent_root(fs_info, ins->objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -4642,6 +4684,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_key extent_key;
@@ -4673,8 +4716,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- &extent_key, size);
+ extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
+ size);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -5472,7 +5516,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
@@ -6051,6 +6096,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
int dev_ret = 0;
int ret = 0;
+ if (range->start == U64_MAX)
+ return -EINVAL;
+
/*
* Check range overflow if range->len is set.
* The default range->len is U64_MAX.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4e03a6d3aa32..d6d48ecf823c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2314,8 +2314,8 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
- if (btrfs_is_zoned(fs_info))
- return btrfs_repair_one_zone(fs_info, logical);
+ if (btrfs_repair_one_zone(fs_info, logical))
+ return 0;
bio = btrfs_bio_alloc(1);
bio->bi_iter.bi_size = 0;
@@ -3087,9 +3087,6 @@ static void end_bio_extent_readpage(struct bio *bio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror;
atomic_dec(&eb->io_pages);
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
- &eb->bflags))
- btree_readahead_hook(eb, -EIO);
}
readpage_ok:
if (likely(uptodate)) {
@@ -3187,13 +3184,12 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
/**
* Attempt to add a page to bio
*
- * @bio: destination bio
+ * @bio_ctrl: record both the bio and its bio_flags
* @page: page to add to the bio
* @disk_bytenr: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @pg_offset: starting offset in the page
* @size: portion of page that we want to write
- * @prev_bio_flags: flags of previous bio to see if we can merge the current one
+ * @pg_offset: starting offset in the page
* @bio_flags: flags of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
@@ -3283,8 +3279,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
else
bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
- if (!btrfs_is_zoned(fs_info) ||
- bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+ if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
}
@@ -3339,7 +3334,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
- if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct btrfs_device *device;
device = btrfs_zoned_get_device(fs_info, disk_bytenr,
@@ -3785,12 +3780,13 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
- struct page *page, struct writeback_control *wbc,
- unsigned long *nr_written)
+ struct page *page, struct writeback_control *wbc)
{
const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
+ /* How many pages are started by btrfs_run_delalloc_range() */
+ unsigned long nr_written = 0;
int ret;
int page_started = 0;
@@ -3806,7 +3802,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
continue;
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, &page_started, nr_written, wbc);
+ delalloc_end, &page_started, &nr_written, wbc);
if (ret) {
btrfs_page_set_error(inode->root->fs_info, page,
page_offset(page), PAGE_SIZE);
@@ -3829,16 +3825,13 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
thresh);
}
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
+ /* Did btrfs_run_delalloc_range() already unlock and start the IO? */
if (page_started) {
/*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
+ * We've unlocked the page, so we can't update the mapping's
+ * writeback index, just update nr_to_write.
*/
- wbc->nr_to_write -= *nr_written;
+ wbc->nr_to_write -= nr_written;
return 1;
}
@@ -3910,7 +3903,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct writeback_control *wbc,
struct extent_page_data *epd,
loff_t i_size,
- unsigned long nr_written,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -3929,7 +3921,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
- update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
}
@@ -3938,7 +3929,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(wbc, nr_written + 1);
+ update_nr_written(wbc, 1);
while (cur <= end) {
u64 disk_bytenr;
@@ -4076,7 +4067,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned long nr_written = 0;
trace___extent_writepage(page, inode, wbc);
@@ -4105,7 +4095,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (!epd->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
if (ret == 1)
return 0;
if (ret)
@@ -4113,7 +4103,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
- nr_written, &nr);
+ &nr);
if (ret == 1)
return 0;
@@ -4314,6 +4304,20 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
return;
/*
+ * A read may stumble upon this buffer later; make sure that it sees
+ * the error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
+ * We also need to set the mapping's error, because a write error will
+ * flip the file system read-only, and once read-only syncfs() would
+ * return 0 if we didn't bump the error sequence for the superblock's
+ * mapping.
+ */
+ mapping_set_error(page->mapping, -EIO);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
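
Note: the effect described in the added comment is visible from userspace: once mapping_set_error() has bumped the mapping's error sequence, a later syncfs() on the filesystem reports the writeback failure instead of returning 0. A hedged illustration; the mount point path below is made up:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical btrfs mount point, for illustration only. */
	int fd = open("/mnt/btrfs", O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Reports a prior metadata writeback error, e.g. EIO. */
	if (syncfs(fd) < 0)
		perror("syncfs");
	close(fd);
	return 0;
}
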
@@ -5175,8 +5179,6 @@ int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
- const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
@@ -5188,11 +5190,9 @@ int extent_writepages(struct address_space *mapping,
* Allow only a single thread to do the reloc work in zoned mode to
* protect the write pointer updates.
*/
- if (data_reloc && zoned)
- btrfs_inode_lock(inode, 0);
+ btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- if (data_reloc && zoned)
- btrfs_inode_unlock(inode, 0);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
@@ -6597,6 +6597,14 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
+ /*
+ * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
+ * operation, which could potentially still be in flight. In this case
+ * we simply want to return an error.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
+ return -EIO;
+
if (eb->fs_info->sectorsize < PAGE_SIZE)
return read_extent_buffer_subpage(eb, wait, mirror_num);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d1cbb64a78f3..90c5c38836ab 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -208,7 +208,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
csum_offset = (bytenr - found_key.offset) >>
fs_info->sectorsize_bits;
- csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+ csums_in_item = btrfs_item_size(leaf, path->slots[0]);
csums_in_item /= csum_size;
if (csum_offset == csums_in_item) {
@@ -257,6 +257,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 disk_bytenr,
u64 len, u8 *dst)
{
+ struct btrfs_root *csum_root;
struct btrfs_csum_item *item = NULL;
struct btrfs_key key;
const u32 sectorsize = fs_info->sectorsize;
@@ -274,7 +275,7 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
@@ -285,13 +286,14 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info,
/* Current item doesn't contain the desired range, search again */
btrfs_release_path(path);
- item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0);
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
@@ -376,7 +378,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
- if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
return BLK_STS_OK;
/*
@@ -534,7 +537,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
key.type == BTRFS_EXTENT_CSUM_KEY) {
offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
- btrfs_item_size_nr(leaf, path->slots[0] - 1))
+ btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
@@ -559,7 +562,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.offset > start)
start = key.offset;
- size = btrfs_item_size_nr(leaf, path->slots[0]);
+ size = btrfs_item_size(leaf, path->slots[0]);
csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
if (csum_end <= start) {
path->slots[0]++;
@@ -750,7 +753,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u32 blocksize_bits = fs_info->sectorsize_bits;
leaf = path->nodes[0];
- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key->offset;
@@ -801,7 +804,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
- ASSERT(root == fs_info->csum_root ||
+ ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
@@ -834,7 +837,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
if (key.offset >= end_byte)
break;
- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key.offset;
@@ -1002,7 +1005,7 @@ again:
item_end = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
+ btrfs_item_size(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
@@ -1013,7 +1016,7 @@ again:
u32 item_size;
/* we found one, but it isn't big enough yet */
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if ((item_size / csum_size) >=
MAX_CSUM_ITEMS(fs_info, csum_size)) {
/* already at max size, make a new one */
@@ -1070,7 +1073,7 @@ again:
}
extend_csum:
- if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
+ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
csum_size) {
int extend_nr;
u64 tmp;
@@ -1125,7 +1128,7 @@ extend_csum:
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
- diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+ diff = diff - btrfs_item_size(leaf, path->slots[0]);
diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
diff /= csum_size;
diff *= csum_size;
@@ -1162,7 +1165,7 @@ insert:
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((unsigned char *)item +
- btrfs_item_size_nr(leaf, path->slots[0]));
+ btrfs_item_size(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f3fee88c8ee0..01a408db5683 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,6 +23,7 @@
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
+#include "inode-item.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -37,7 +38,7 @@ struct btrfs_trim_range {
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info);
+ struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes, bool for_alloc);
@@ -45,7 +46,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
- u64 bytes);
+ u64 bytes, bool update_stats);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -288,9 +289,18 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct inode *inode)
+ struct inode *vfs_inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(vfs_inode),
+ .new_size = 0,
+ .ino = btrfs_ino(BTRFS_I(vfs_inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
int ret = 0;
bool locked = false;
@@ -320,19 +330,26 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
}
- btrfs_i_size_write(BTRFS_I(inode), 0);
- truncate_pagecache(inode, 0);
+ btrfs_i_size_write(inode, 0);
+ truncate_pagecache(vfs_inode, 0);
+
+ lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
/*
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
- ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, BTRFS_EXTENT_DATA_KEY, NULL);
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
+
+ unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
fail:
if (locked)
@@ -666,7 +683,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->block_group;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
@@ -872,7 +889,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
ret = btrfs_add_free_space(block_group, info->offset,
info->bytes);
kmem_cache_free(btrfs_free_space_cachep, info);
@@ -886,7 +903,7 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
bytes);
if (ret)
break;
- bitmap_clear_bits(ctl, info, offset, bytes);
+ bitmap_clear_bits(ctl, info, offset, bytes, true);
offset = info->offset;
bytes = ctl->unit;
}
@@ -1581,6 +1598,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
}
/*
+ * This is a little subtle. We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0. When it's 0 we don't want to tell the allocator the
+ * wrong thing: we want to use the real max_extent_size we've already
+ * found if it's larger, or use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries whose ->bytes is
+ * less than the required bytes. So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have. For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
+ * call into find_free_space(), and return with max_extent_size == 4K, because
+ * that first bitmap entry had ->max_extent_size set, but the second one did
+ * not. If instead we returned 8K we'd come in searching for 8K, and find the
+ * 8K contiguous range.
+ *
+ * Consider the other case: we have two 8K chunks in that second entry and still
+ * don't have ->max_extent_size set. We'll return 16K, and the next time the
+ * allocator comes in it'll fully search our second bitmap, and this time it'll
+ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
+ * right allocation the next loop through.
+ */
+static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+{
+ if (entry->bitmap && entry->max_extent_size)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
+/*
+ * We want the largest entry to be leftmost, so this is inverted from what you'd
+ * normally expect.
+ */
+static bool entry_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct btrfs_free_space *entry, *exist;
+
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
+ return get_max_extent_size(exist) < get_max_extent_size(entry);
+}
+
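+
Note: the comparator above makes rb_first_cached() hand back the entry with the largest usable size in O(1). A self-contained sketch of the same trick with a hypothetical item type (not btrfs code), using the generic rb_add_cached()/rb_first_cached() helpers:

#include <linux/rbtree.h>
#include <linux/types.h>

struct item {
	struct rb_node node;
	u64 size;
};

static struct rb_root_cached size_tree = RB_ROOT_CACHED;

/* Inverted: report the bigger entry as "less" so it sorts leftmost. */
static bool item_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct item *a = rb_entry(node, struct item, node);
	const struct item *b = rb_entry(parent, struct item, node);

	return b->size < a->size;
}

static void add_item(struct item *it)
{
	rb_add_cached(&it->node, &size_tree, item_less);
}

/* O(1): the leftmost (cached-first) node is the largest item. */
static struct item *largest_item(void)
{
	struct rb_node *n = rb_first_cached(&size_tree);

	return n ? rb_entry(n, struct item, node) : NULL;
}
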
+/*
* searches the tree for the given offset.
*
* fuzzy - If this is set, then we are trying to make an allocation, and we just
@@ -1592,15 +1653,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
u64 offset, int bitmap_only, int fuzzy)
{
struct rb_node *n = ctl->free_space_offset.rb_node;
- struct btrfs_free_space *entry, *prev = NULL;
+ struct btrfs_free_space *entry = NULL, *prev = NULL;
/* find entry that is closest to the 'offset' */
- while (1) {
- if (!n) {
- entry = NULL;
- break;
- }
-
+ while (n) {
entry = rb_entry(n, struct btrfs_free_space, offset_index);
prev = entry;
@@ -1610,6 +1666,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
n = n->rb_right;
else
break;
+
+ entry = NULL;
}
if (bitmap_only) {
@@ -1686,6 +1744,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
return NULL;
while (1) {
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
if (entry->offset + BITS_PER_BITMAP *
ctl->unit > offset)
@@ -1694,33 +1756,25 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
if (entry->offset + entry->bytes > offset)
break;
}
-
- n = rb_next(&entry->offset_index);
- if (!n)
- return NULL;
- entry = rb_entry(n, struct btrfs_free_space, offset_index);
}
return entry;
}
-static inline void
-__unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
+static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
{
rb_erase(&info->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
ctl->free_extents--;
if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
}
-}
-static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- __unlink_free_space(ctl, info);
- ctl->free_space -= info->bytes;
+ if (update_stat)
+ ctl->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_free_space_ctl *ctl,
@@ -1734,6 +1788,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
if (ret)
return ret;
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+
if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR]++;
ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
@@ -1744,9 +1800,25 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
return ret;
}
-static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info,
- u64 offset, u64 bytes)
+static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ ASSERT(info->bitmap);
+
+ /*
+ * If our bytes_index node is empty (unlinked), this entry is on a
+ * cluster and we don't want to re-link it into the ctl bytes index.
+ */
+ if (RB_EMPTY_NODE(&info->bytes_index))
+ return;
+
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+}
+
+static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes, bool update_stat)
{
unsigned long start, count, end;
int extent_delta = -1;
@@ -1762,6 +1834,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
if (info->max_extent_size > ctl->unit)
info->max_extent_size = 0;
+ relink_bitmap_entry(ctl, info);
+
if (start && test_bit(start - 1, info->bitmap))
extent_delta++;
@@ -1773,14 +1847,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
}
-}
-static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
-{
- __bitmap_clear_bits(ctl, info, offset, bytes);
- ctl->free_space -= bytes;
+ if (update_stat)
+ ctl->free_space -= bytes;
}
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
@@ -1797,9 +1866,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
bitmap_set(info->bitmap, start, count);
+ /*
+ * We set some bytes, so we no longer know what the max extent size
+ * is.
+ */
+ info->max_extent_size = 0;
info->bytes += bytes;
ctl->free_space += bytes;
+ relink_bitmap_entry(ctl, info);
+
if (start && test_bit(start - 1, info->bitmap))
extent_delta--;
@@ -1867,20 +1943,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
*bytes = (u64)(max_bits) * ctl->unit;
bitmap_info->max_extent_size = *bytes;
+ relink_bitmap_entry(ctl, bitmap_info);
return -1;
}
-static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
-{
- if (entry->bitmap)
- return entry->max_extent_size;
- return entry->bytes;
-}
-
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
- unsigned long align, u64 *max_extent_size)
+ unsigned long align, u64 *max_extent_size, bool use_bytes_index)
{
struct btrfs_free_space *entry;
struct rb_node *node;
@@ -1890,16 +1960,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (!ctl->free_space_offset.rb_node)
goto out;
+again:
+ if (use_bytes_index) {
+ node = rb_first_cached(&ctl->free_space_bytes);
+ } else {
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
+ 0, 1);
+ if (!entry)
+ goto out;
+ node = &entry->offset_index;
+ }
- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
- if (!entry)
- goto out;
+ for (; node; node = rb_next(node)) {
+ if (use_bytes_index)
+ entry = rb_entry(node, struct btrfs_free_space,
+ bytes_index);
+ else
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
- for (node = &entry->offset_index; node; node = rb_next(node)) {
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ /*
+ * If we are using the bytes index then all subsequent entries
+ * in this tree are going to be < bytes, so simply set the max
+ * extent size and exit the loop.
+ *
+ * If we're using the offset index then we need to keep going
+ * through the rest of the tree.
+ */
if (entry->bytes < *bytes) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
+ if (use_bytes_index)
+ break;
continue;
}
@@ -1916,6 +2008,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
tmp = entry->offset;
}
+ /*
+ * We don't break here if we're using the bytes index, because a
+ * later entry may have the correct alignment and the right size,
+ * and we don't want to miss that possibility. At worst this adds
+ * another pass through the loop, but breaking here could make us
+ * return ENOSPC prematurely.
+ */
if (entry->bytes < *bytes + align_off) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
@@ -1923,6 +2022,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bitmap) {
+ struct rb_node *old_next = rb_next(node);
u64 size = *bytes;
ret = search_bitmap(ctl, entry, &tmp, &size, true);
@@ -1935,6 +2035,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
max(get_max_extent_size(entry),
*max_extent_size);
}
+
+ /*
+ * The bitmap may have been moved within the bytes index
+ * because its max_extent_size was just updated. If that
+ * happened, start the walk from the beginning again.
+ */
+ if (use_bytes_index && old_next != rb_next(node))
+ goto again;
continue;
}
@@ -1973,7 +2082,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
}
- unlink_free_space(ctl, bitmap_info);
+ unlink_free_space(ctl, bitmap_info, true);
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
@@ -2011,7 +2120,7 @@ again:
/* Cannot clear past the end of the bitmap */
search_bytes = min(search_bytes, end - search_start + 1);
- bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
*offset += search_bytes;
*bytes -= search_bytes;
@@ -2083,12 +2192,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
- /*
- * We set some bytes, we have no idea what the max extent size is
- * anymore.
- */
- info->max_extent_size = 0;
-
return bytes_to_set;
}
@@ -2096,7 +2199,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
@@ -2165,7 +2268,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
return 0;
if (ctl->op == &free_space_op)
- block_group = ctl->private;
+ block_group = ctl->block_group;
again:
/*
* Since we link bitmaps right into the cluster we need to see if we
@@ -2310,10 +2413,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
/* See try_merge_free_space() comment. */
if (right_info && !right_info->bitmap &&
(!is_trimmed || btrfs_free_space_trimmed(right_info))) {
- if (update_stat)
- unlink_free_space(ctl, right_info);
- else
- __unlink_free_space(ctl, right_info);
+ unlink_free_space(ctl, right_info, update_stat);
info->bytes += right_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, right_info);
merged = true;
@@ -2323,10 +2423,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (left_info && !left_info->bitmap &&
left_info->offset + left_info->bytes == offset &&
(!is_trimmed || btrfs_free_space_trimmed(left_info))) {
- if (update_stat)
- unlink_free_space(ctl, left_info);
- else
- __unlink_free_space(ctl, left_info);
+ unlink_free_space(ctl, left_info, update_stat);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, left_info);
@@ -2362,10 +2459,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
- if (update_stat)
- bitmap_clear_bits(ctl, bitmap, end, bytes);
- else
- __bitmap_clear_bits(ctl, bitmap, end, bytes);
+ bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
if (!bitmap->bytes)
free_bitmap(ctl, bitmap);
@@ -2419,10 +2513,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
- if (update_stat)
- bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
- else
- __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
if (!bitmap->bytes)
free_bitmap(ctl, bitmap);
@@ -2466,12 +2557,12 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
}
}
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_free_space_ctl *ctl,
+int __btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
int ret = 0;
u64 filter_bytes = bytes;
@@ -2486,6 +2577,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
info->bytes = bytes;
info->trim_state = trim_state;
RB_CLEAR_NODE(&info->offset_index);
+ RB_CLEAR_NODE(&info->bytes_index);
spin_lock(&ctl->tree_lock);
@@ -2602,9 +2694,7 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size, trim_state);
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
@@ -2635,9 +2725,7 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size, trim_state);
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
@@ -2696,7 +2784,7 @@ again:
re_search = false;
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
if (offset == info->offset) {
u64 to_free = min(bytes, info->bytes);
@@ -2732,7 +2820,7 @@ again:
}
spin_unlock(&ctl->tree_lock);
- ret = __btrfs_add_free_space(block_group->fs_info, ctl,
+ ret = __btrfs_add_free_space(block_group,
offset + bytes,
old_end - (offset + bytes),
info->trim_state);
@@ -2797,8 +2885,9 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
ctl->start = block_group->start;
- ctl->private = block_group;
+ ctl->block_group = block_group;
ctl->op = &free_space_op;
+ ctl->free_space_bytes = RB_ROOT_CACHED;
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
@@ -2864,6 +2953,8 @@ static void __btrfs_return_cluster_to_free_space(
}
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
+ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
+ entry_less);
}
cluster->root = RB_ROOT;
spin_unlock(&cluster->lock);
@@ -2879,7 +2970,7 @@ static void __btrfs_remove_free_space_cache_locked(
while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
info = rb_entry(node, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
kmem_cache_free(btrfs_free_space_cachep, info);
} else {
free_bitmap(ctl, info);
@@ -2893,8 +2984,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
- if (ctl->private)
- btrfs_discard_update_discardable(ctl->private);
+ if (ctl->block_group)
+ btrfs_discard_update_discardable(ctl->block_group);
spin_unlock(&ctl->tree_lock);
}
@@ -2965,18 +3056,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 align_gap = 0;
u64 align_gap_len = 0;
enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ bool use_bytes_index = (offset == block_group->start);
ASSERT(!btrfs_is_zoned(block_group->fs_info));
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
- block_group->full_stripe_len, max_extent_size);
+ block_group->full_stripe_len, max_extent_size,
+ use_bytes_index);
if (!entry)
goto out;
ret = offset;
if (entry->bitmap) {
- bitmap_clear_bits(ctl, entry, offset, bytes);
+ bitmap_clear_bits(ctl, entry, offset, bytes, true);
if (!btrfs_free_space_trimmed(entry))
atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
@@ -2984,7 +3077,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
align_gap_len = offset - entry->offset;
align_gap = entry->offset;
align_gap_trim_state = entry->trim_state;
@@ -3006,8 +3099,7 @@ out:
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
- __btrfs_add_free_space(block_group->fs_info, ctl,
- align_gap, align_gap_len,
+ __btrfs_add_free_space(block_group, align_gap, align_gap_len,
align_gap_trim_state);
return ret;
}
@@ -3078,7 +3170,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
}
ret = search_start;
- __bitmap_clear_bits(ctl, entry, ret, bytes);
+ bitmap_clear_bits(ctl, entry, ret, bytes, false);
return ret;
}
@@ -3254,6 +3346,17 @@ again:
cluster->window_start = start * ctl->unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+
+ /*
+ * The bitmap helpers need to know whether this entry is still linked
+ * into the bytes index (free_space_bytes) so they can remove and
+ * re-insert it there when the bitmap is manipulated. Clear the
+ * bytes_index node here so those helpers leave the bytes index alone
+ * until this bitmap entry is added back into the normal cache.
+ */
+ RB_CLEAR_NODE(&entry->bytes_index);
+
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
ASSERT(!ret); /* -EEXIST; Logic error */
@@ -3344,6 +3447,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
total_size += entry->bytes;
@@ -3535,13 +3639,13 @@ static int do_trimming(struct btrfs_block_group *block_group,
mutex_lock(&ctl->cache_writeout_mutex);
if (reserved_start < start)
- __btrfs_add_free_space(fs_info, ctl, reserved_start,
+ __btrfs_add_free_space(block_group, reserved_start,
start - reserved_start,
reserved_trim_state);
if (start + bytes < reserved_start + reserved_bytes)
- __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
+ __btrfs_add_free_space(block_group, end, reserved_end - end,
reserved_trim_state);
- __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
+ __btrfs_add_free_space(block_group, start, bytes, trim_state);
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3615,7 +3719,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
/*
* Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
* If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
@@ -3641,7 +3745,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
goto next;
}
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
kmem_cache_free(btrfs_free_space_cachep, entry);
}
@@ -3828,7 +3932,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
bytes > (max_discard_size + minlen))
bytes = max_discard_size;
- bitmap_clear_bits(ctl, entry, start, bytes);
+ bitmap_clear_bits(ctl, entry, start, bytes, true);
if (entry->bytes == 0)
free_bitmap(ctl, entry);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 1f23088d43f9..15591b299895 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -22,6 +22,7 @@ enum btrfs_trim_state {
struct btrfs_free_space {
struct rb_node offset_index;
+ struct rb_node bytes_index;
u64 offset;
u64 bytes;
u64 max_extent_size;
@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap(
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
+ struct rb_root_cached free_space_bytes;
u64 free_space;
int extents_thresh;
int free_extents;
@@ -54,7 +56,7 @@ struct btrfs_free_space_ctl {
s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
const struct btrfs_free_space_op *op;
- void *private;
+ struct btrfs_block_group *block_group;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
};
@@ -101,10 +103,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl);
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_free_space_ctl *ctl,
- u64 bytenr, u64 size,
- enum btrfs_trim_state trim_state);
+int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
+ u64 size, enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index a33bca94d133..655aad0f9e1c 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -16,6 +16,18 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
+static struct btrfs_root *btrfs_free_space_root(
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ return btrfs_global_root(block_group->fs_info, &key);
+}
+
void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
@@ -51,7 +63,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -85,7 +97,7 @@ struct btrfs_free_space_info *search_free_space_info(
struct btrfs_path *path, int cow)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
int ret;
@@ -188,7 +200,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -326,7 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -586,7 +598,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
- struct btrfs_root *root = block_group->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
@@ -699,7 +711,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 found_start, found_end;
u64 end = start + size;
@@ -851,7 +863,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key, new_key;
u64 found_start, found_end;
u64 end = start + size;
@@ -1046,7 +1058,7 @@ out:
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
- struct btrfs_root *extent_root = trans->fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path, *path2;
struct btrfs_key key;
u64 start, end;
@@ -1080,6 +1092,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
+ extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
@@ -1157,7 +1170,11 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
ret = PTR_ERR(free_space_root);
goto abort;
}
- fs_info->free_space_root = free_space_root;
+ ret = btrfs_global_root_insert(free_space_root);
+ if (ret) {
+ btrfs_put_root(free_space_root);
+ goto abort;
+ }
node = rb_first(&fs_info->block_group_cache_tree);
while (node) {
@@ -1232,7 +1249,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_root *free_space_root = fs_info->free_space_root;
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
int ret;
trans = btrfs_start_transaction(tree_root, 0);
@@ -1241,7 +1263,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
- fs_info->free_space_root = NULL;
ret = clear_free_space_tree(trans, free_space_root);
if (ret)
@@ -1251,13 +1272,14 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto abort;
+ btrfs_global_root_delete(free_space_root);
list_del(&free_space_root->dirty_list);
btrfs_tree_lock(free_space_root->node);
btrfs_clean_tree_block(free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
- btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
- 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
@@ -1319,7 +1341,7 @@ out:
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_path *path;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -1410,7 +1432,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- root = fs_info->free_space_root;
+ root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
@@ -1488,7 +1510,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- root = fs_info->free_space_root;
+ root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 37f36ffdaf6b..0eeb5ea87894 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,6 +4,7 @@
*/
#include "ctree.h"
+#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
@@ -19,7 +20,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
u32 cur_offset = 0;
int len;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
@@ -45,7 +46,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
u32 cur_offset = 0;
int ref_name_len;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
/*
@@ -139,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (index)
*index = btrfs_inode_extref_index(leaf, extref);
@@ -208,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (index)
*index = btrfs_inode_ref_index(leaf, ref);
@@ -256,7 +257,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_item *item;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
@@ -282,9 +282,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
goto out;
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
- ptr += btrfs_item_size(leaf, item) - ins_len;
+ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
@@ -332,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ref)
goto out;
- old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
btrfs_extend_item(path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
@@ -419,3 +418,332 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
}
return ret;
}
+
+static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
+ struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ u64 offset, int extent_type, int slot)
+{
+ if (!inode)
+ return;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot,
+ offset);
+ else
+ trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
+}
+
+/*
+ * Remove inode items from a given root.
+ *
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @control: The btrfs_truncate_control to control how and what we
+ * are truncating.
+ *
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equal to @control->min_type. When that type is
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equal to @control->new_size. If a file extent item that starts
+ * before @control->new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and BTRFS_NEED_TRUNCATE_BLOCK when
+ * @control->min_type is BTRFS_EXTENT_DATA_KEY and the caller must truncate the
+ * last block.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 new_size = control->new_size;
+ u64 extent_num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 item_end = 0;
+ u32 found_type = (u8)-1;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int ret;
+ u64 bytes_deleted = 0;
+ bool be_nice = false;
+
+ ASSERT(control->inode || !control->clear_extent_range);
+ ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);
+
+ control->last_size = new_size;
+ control->sub_bytes = 0;
+
+ /*
+ * For shareable roots we want to back off from time to time; these turn
+ * out to be subvolume roots, reloc roots, and data reloc roots.
+ */
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ be_nice = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_BACK;
+
+ key.objectid = control->ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+search_again:
+ /*
+ * With a 16K leaf size and 128MiB extents, you can actually queue up a
+ * huge file in a single leaf. Most of the time that bytes_deleted is
+ * > 0, it will be huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > SZ_32M &&
+ btrfs_should_end_transaction(trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = 0;
+ /* There are no items in the tree for us to truncate, we're done */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (1) {
+ u64 clear_start = 0, clear_len = 0, extent_start = 0;
+ bool should_throttle = false;
+
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = found_key.type;
+
+ if (found_key.objectid != control->ino)
+ break;
+
+ if (found_type < control->min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE)
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ item_end += btrfs_file_extent_ram_bytes(leaf, fi);
+
+ btrfs_trace_truncate(control->inode, leaf, fi,
+ found_key.offset, extent_type,
+ path->slots[0]);
+ item_end--;
+ }
+ if (found_type > control->min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
+ break;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ }
+
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ control->extents_found++;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+
+ clear_start = found_key.offset;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ fs_info->sectorsize);
+ clear_start = ALIGN(new_size, fs_info->sectorsize);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes - extent_num_bytes);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = found_key.offset -
+ btrfs_file_extent_offset(leaf, fi);
+
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ }
+ clear_len = num_dec;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * We can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+ btrfs_file_extent_compression(leaf, fi) == 0) {
+ u32 size = (u32)(new_size - found_key.offset);
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(path, size, 1);
+ } else if (!del_item) {
+ /*
+ * We have to bail so the last_size is set to
+ * just before this extent.
+ */
+ ret = BTRFS_NEED_TRUNCATE_BLOCK;
+ break;
+ } else {
+ /*
+ * Inline extents are special, we just treat
+ * them as a full sector worth in the file
+ * extent tree just for simplicity's sake.
+ */
+ clear_len = fs_info->sectorsize;
+ }
+
+ control->sub_bytes += item_end + 1 - new_size;
+ }
+delete:
+ /*
+ * We only want to clear the file extent range if we're
+ * modifying the actual inode's mapping, which is just the
+ * normal truncate path.
+ */
+ if (control->clear_extent_range) {
+ ret = btrfs_inode_clear_file_extent_range(control->inode,
+ clear_start, clear_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (del_item) {
+ ASSERT(!pending_del_nr ||
+ ((path->slots[0] + 1) == pending_del_slot));
+
+ control->last_size = found_key.offset;
+ if (!pending_del_nr) {
+ /* No pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* Hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ }
+ } else {
+ control->last_size = new_size;
+ break;
+ }
+
+ if (del_item && extent_start != 0 && !control->skip_ref_updates) {
+ struct btrfs_ref ref = { 0 };
+
+ bytes_deleted += extent_num_bytes;
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+ extent_start, extent_num_bytes, 0);
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ control->ino, extent_offset,
+ root->root_key.objectid, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ if (be_nice) {
+ if (btrfs_should_throttle_delayed_refs(trans))
+ should_throttle = true;
+ }
+ }
+
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot ||
+ should_throttle) {
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ pending_del_nr = 0;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * We can generate a lot of delayed refs, so we need to
+ * throttle every once in a while and make sure we're
+ * adding enough space to keep up with the work we are
+ * generating. Since we hold a transaction here we
+ * can't flush, and we don't want to FLUSH_LIMIT because
+ * we could have generated too many delayed refs to
+ * actually allocate, so just bail if we're short and
+ * let the normal reservation dance happen higher up.
+ */
+ if (should_throttle) {
+ ret = btrfs_delayed_refs_rsv_refill(fs_info,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ goto search_again;
+ } else {
+ path->slots[0]--;
+ }
+ }
+out:
+ if (ret >= 0 && pending_del_nr) {
+ int err;
+
+ err = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ ret = err;
+ }
+ }
+
+ ASSERT(control->last_size >= new_size);
+ if (!ret && control->last_size > new_size)
+ control->last_size = new_size;
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
new file mode 100644
index 000000000000..a8fc16d0147f
--- /dev/null
+++ b/fs/btrfs/inode-item.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_INODE_ITEM_H
+#define BTRFS_INODE_ITEM_H
+
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_key;
+struct btrfs_inode_extref;
+struct btrfs_inode;
+struct extent_buffer;
+
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define BTRFS_NEED_TRUNCATE_BLOCK 1
+
+struct btrfs_truncate_control {
+ /*
+ * IN: the inode we're operating on, this can be NULL if
+ * ->clear_extent_range is false.
+ */
+ struct btrfs_inode *inode;
+
+ /* IN: the size we're truncating to. */
+ u64 new_size;
+
+ /* OUT: the number of extents truncated. */
+ u64 extents_found;
+
+ /* OUT: the last size we truncated this inode to. */
+ u64 last_size;
+
+ /* OUT: the number of bytes to sub from this inode. */
+ u64 sub_bytes;
+
+ /* IN: the ino we are truncating. */
+ u64 ino;
+
+ /*
+ * IN: minimum key type to remove. Keys with a type greater than this are
+ * always removed; keys with exactly this type are removed only if their
+ * offset >= new_size.
+ */
+ u32 min_type;
+
+ /*
+ * IN: true if we don't want to do extent reference updates for any file
+ * extents we drop.
+ */
+ bool skip_ref_updates;
+
+ /*
+ * IN: true if we need to clear the file extent range for the inode as
+ * we drop the file extent items.
+ */
+ bool clear_extent_range;
+};
+
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control);
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot, const char *name,
+ int name_len);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const char *name, int name_len);
+
+#endif
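Callers are expected to fill in a btrfs_truncate_control on the stack and hand it to btrfs_truncate_inode_items(). A minimal sketch of a full-delete caller, modelled on the btrfs_evict_inode() change later in this series, could look like the following; the function name is hypothetical and transaction setup is left to the caller.

/*
 * Sketch only: drive a complete removal of an inode's items through the
 * new control structure, as the eviction path does. Error handling and
 * transaction management are the caller's responsibility.
 */
static int example_drop_all_items(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_inode *inode)
{
	struct btrfs_truncate_control control = {
		.inode = inode,
		.ino = btrfs_ino(inode),
		.new_size = 0,
		.min_type = 0,		/* remove every key type */
	};

	return btrfs_truncate_inode_items(trans, root, &control);
}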
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b8c911a4a320..3b2403b6127f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -54,6 +54,7 @@
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
+#include "inode-item.h"
struct btrfs_iget_args {
u64 ino;
@@ -61,8 +62,6 @@ struct btrfs_iget_args {
};
struct btrfs_dio_data {
- u64 reserve;
- loff_t length;
ssize_t submitted;
struct extent_changeset *data_reserved;
};
@@ -1532,11 +1531,12 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
- int ret;
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
struct btrfs_ordered_sum *sums;
+ int ret;
LIST_HEAD(list);
- ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
+ ret = btrfs_lookup_csums_range(csum_root, bytenr,
bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;
@@ -2518,7 +2518,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- !fs_info->csum_root;
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
@@ -2586,11 +2586,15 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
struct list_head *list)
{
struct btrfs_ordered_sum *sum;
+ struct btrfs_root *csum_root = NULL;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
+ if (!csum_root)
+ csum_root = btrfs_csum_root(trans->fs_info,
+ sum->bytenr);
+ ret = btrfs_csum_file_blocks(trans, csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -3316,7 +3320,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (!root->fs_info->csum_root)
+ if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
return 0;
ASSERT(page_offset(page) <= start &&
@@ -3477,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
- if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
+ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
path = btrfs_alloc_path();
@@ -3635,8 +3639,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* release the path since we're done with it */
btrfs_release_path(path);
- root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
-
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
@@ -4615,389 +4617,6 @@ out:
}
/*
- * Return this if we need to call truncate_block for the last bit of the
- * truncate.
- */
-#define NEED_TRUNCATE_BLOCK 1
-
-/*
- * Remove inode items from a given root.
- *
- * @trans: A transaction handle.
- * @root: The root from which to remove items.
- * @inode: The inode whose items we want to remove.
- * @new_size: The new i_size for the inode. This is only applicable when
- * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
- * @min_type: The minimum key type to remove. All keys with a type
- * greater than this value are removed and all keys with
- * this type are removed only if their offset is >= @new_size.
- * @extents_found: Output parameter that will contain the number of file
- * extent items that were removed or adjusted to the new
- * inode i_size. The caller is responsible for initializing
- * the counter. Also, it can be NULL if the caller does not
- * need this counter.
- *
- * Remove all keys associated with the inode from the given root that have a key
- * with a type greater than or equals to @min_type. When @min_type has a value of
- * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
- * greater than or equals to @new_size. If a file extent item that starts before
- * @new_size and ends after it is found, its length is adjusted.
- *
- * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
- * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
- */
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode,
- u64 new_size, u32 min_type,
- u64 *extents_found)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *fi;
- struct btrfs_key key;
- struct btrfs_key found_key;
- u64 extent_start = 0;
- u64 extent_num_bytes = 0;
- u64 extent_offset = 0;
- u64 item_end = 0;
- u64 last_size = new_size;
- u32 found_type = (u8)-1;
- int found_extent;
- int del_item;
- int pending_del_nr = 0;
- int pending_del_slot = 0;
- int extent_type = -1;
- int ret;
- u64 ino = btrfs_ino(inode);
- u64 bytes_deleted = 0;
- bool be_nice = false;
- bool should_throttle = false;
- const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
- struct extent_state *cached_state = NULL;
-
- BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
-
- /*
- * For non-free space inodes and non-shareable roots, we want to back
- * off from time to time. This means all inodes in subvolume roots,
- * reloc roots, and data reloc roots.
- */
- if (!btrfs_is_free_space_inode(inode) &&
- test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- be_nice = true;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_BACK;
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
- &cached_state);
-
- /*
- * We want to drop from the next block forward in case this
- * new size is not block aligned since we will be keeping the
- * last block of the extent just the way it is.
- */
- btrfs_drop_extent_cache(inode, ALIGN(new_size,
- fs_info->sectorsize),
- (u64)-1, 0);
- }
-
- /*
- * This function is also used to drop the items in the log tree before
- * we relog the inode, so if root != BTRFS_I(inode)->root, it means
- * it is used to drop the logged items. So we shouldn't kill the delayed
- * items.
- */
- if (min_type == 0 && root == inode->root)
- btrfs_kill_delayed_inode_items(inode);
-
- key.objectid = ino;
- key.offset = (u64)-1;
- key.type = (u8)-1;
-
-search_again:
- /*
- * with a 16K leaf size and 128MB extents, you can actually queue
- * up a huge file in a single leaf. Most of the time that
- * bytes_deleted is > 0, it will be huge by the time we get here
- */
- if (be_nice && bytes_deleted > SZ_32M &&
- btrfs_should_end_transaction(trans)) {
- ret = -EAGAIN;
- goto out;
- }
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = 0;
- /* there are no items in the tree for us to truncate, we're
- * done
- */
- if (path->slots[0] == 0)
- goto out;
- path->slots[0]--;
- }
-
- while (1) {
- u64 clear_start = 0, clear_len = 0;
-
- fi = NULL;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
-
- if (found_key.objectid != ino)
- break;
-
- if (found_type < min_type)
- break;
-
- item_end = found_key.offset;
- if (found_type == BTRFS_EXTENT_DATA_KEY) {
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(leaf, fi);
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- item_end +=
- btrfs_file_extent_num_bytes(leaf, fi);
-
- trace_btrfs_truncate_show_fi_regular(
- inode, leaf, fi, found_key.offset);
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- item_end += btrfs_file_extent_ram_bytes(leaf,
- fi);
-
- trace_btrfs_truncate_show_fi_inline(
- inode, leaf, fi, path->slots[0],
- found_key.offset);
- }
- item_end--;
- }
- if (found_type > min_type) {
- del_item = 1;
- } else {
- if (item_end < new_size)
- break;
- if (found_key.offset >= new_size)
- del_item = 1;
- else
- del_item = 0;
- }
- found_extent = 0;
- /* FIXME, shrink the extent if the ref count is only 1 */
- if (found_type != BTRFS_EXTENT_DATA_KEY)
- goto delete;
-
- if (extents_found != NULL)
- (*extents_found)++;
-
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- u64 num_dec;
-
- clear_start = found_key.offset;
- extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (!del_item) {
- u64 orig_num_bytes =
- btrfs_file_extent_num_bytes(leaf, fi);
- extent_num_bytes = ALIGN(new_size -
- found_key.offset,
- fs_info->sectorsize);
- clear_start = ALIGN(new_size, fs_info->sectorsize);
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_num_bytes);
- num_dec = (orig_num_bytes -
- extent_num_bytes);
- if (test_bit(BTRFS_ROOT_SHAREABLE,
- &root->state) &&
- extent_start != 0)
- inode_sub_bytes(&inode->vfs_inode,
- num_dec);
- btrfs_mark_buffer_dirty(leaf);
- } else {
- extent_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf,
- fi);
- extent_offset = found_key.offset -
- btrfs_file_extent_offset(leaf, fi);
-
- /* FIXME blocksize != 4096 */
- num_dec = btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_start != 0) {
- found_extent = 1;
- if (test_bit(BTRFS_ROOT_SHAREABLE,
- &root->state))
- inode_sub_bytes(&inode->vfs_inode,
- num_dec);
- }
- }
- clear_len = num_dec;
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- /*
- * we can't truncate inline items that have had
- * special encodings
- */
- if (!del_item &&
- btrfs_file_extent_encryption(leaf, fi) == 0 &&
- btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
- btrfs_file_extent_compression(leaf, fi) == 0) {
- u32 size = (u32)(new_size - found_key.offset);
-
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(path, size, 1);
- } else if (!del_item) {
- /*
- * We have to bail so the last_size is set to
- * just before this extent.
- */
- ret = NEED_TRUNCATE_BLOCK;
- break;
- } else {
- /*
- * Inline extents are special, we just treat
- * them as a full sector worth in the file
- * extent tree just for simplicity sake.
- */
- clear_len = fs_info->sectorsize;
- }
-
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- inode_sub_bytes(&inode->vfs_inode,
- item_end + 1 - new_size);
- }
-delete:
- /*
- * We use btrfs_truncate_inode_items() to clean up log trees for
- * multiple fsyncs, and in this case we don't want to clear the
- * file extent range because it's just the log.
- */
- if (root == inode->root) {
- ret = btrfs_inode_clear_file_extent_range(inode,
- clear_start, clear_len);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- }
-
- if (del_item)
- last_size = found_key.offset;
- else
- last_size = new_size;
- if (del_item) {
- if (!pending_del_nr) {
- /* no pending yet, add ourselves */
- pending_del_slot = path->slots[0];
- pending_del_nr = 1;
- } else if (pending_del_nr &&
- path->slots[0] + 1 == pending_del_slot) {
- /* hop on the pending chunk */
- pending_del_nr++;
- pending_del_slot = path->slots[0];
- } else {
- BUG();
- }
- } else {
- break;
- }
- should_throttle = false;
-
- if (found_extent &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_ref ref = { 0 };
-
- bytes_deleted += extent_num_bytes;
-
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset,
- root->root_key.objectid, false);
- ret = btrfs_free_extent(trans, &ref);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- if (be_nice) {
- if (btrfs_should_throttle_delayed_refs(trans))
- should_throttle = true;
- }
- }
-
- if (found_type == BTRFS_INODE_ITEM_KEY)
- break;
-
- if (path->slots[0] == 0 ||
- path->slots[0] != pending_del_slot ||
- should_throttle) {
- if (pending_del_nr) {
- ret = btrfs_del_items(trans, root, path,
- pending_del_slot,
- pending_del_nr);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- pending_del_nr = 0;
- }
- btrfs_release_path(path);
-
- /*
- * We can generate a lot of delayed refs, so we need to
- * throttle every once and a while and make sure we're
- * adding enough space to keep up with the work we are
- * generating. Since we hold a transaction here we
- * can't flush, and we don't want to FLUSH_LIMIT because
- * we could have generated too many delayed refs to
- * actually allocate, so just bail if we're short and
- * let the normal reservation dance happen higher up.
- */
- if (should_throttle) {
- ret = btrfs_delayed_refs_rsv_refill(fs_info,
- BTRFS_RESERVE_NO_FLUSH);
- if (ret) {
- ret = -EAGAIN;
- break;
- }
- }
- goto search_again;
- } else {
- path->slots[0]--;
- }
- }
-out:
- if (ret >= 0 && pending_del_nr) {
- int err;
-
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
- }
- }
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ASSERT(last_size >= new_size);
- if (!ret && last_size > new_size)
- last_size = new_size;
- btrfs_inode_safe_disk_i_size_write(inode, last_size);
- unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
- &cached_state);
- }
-
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* btrfs_truncate_block - read, zero a chunk and write a block
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
@@ -5525,7 +5144,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_trans_handle *trans;
u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
int ret;
@@ -5540,18 +5158,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* above. We reserve our extra bit here because we generate a ton of
* delayed refs activity by truncating.
*
- * If we cannot make our reservation we'll attempt to steal from the
- * global reserve, because we really want to be able to free up space.
+ * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can.
+ * If we fail to make this reservation we can retry without the
+ * delayed_refs_extra so we can make some forward progress.
*/
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_EVICT);
if (ret) {
- /*
- * Try to steal from the global reserve if there is space for
- * it.
- */
- if (btrfs_check_space_for_delayed_refs(fs_info) ||
- btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
+ BTRFS_RESERVE_FLUSH_EVICT);
+ if (ret) {
btrfs_warn(fs_info,
"could not allocate space for delete; will truncate on mount");
return ERR_PTR(-ENOSPC);
@@ -5610,10 +5226,22 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ /*
+ * This makes sure the inode item in the tree is uptodate and the space
+ * for the inode update is released.
+ */
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
if (ret)
goto no_delete;
+ /*
+ * This drops any pending insert or delete operations we have for this
+ * inode. We could have a delayed dir index deletion queued up, but
+ * we're removing the inode completely so that'll be taken care of in
+ * the truncate.
+ */
+ btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
@@ -5623,14 +5251,20 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .new_size = 0,
+ .min_type = 0,
+ };
+
trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
trans->block_rsv = rsv;
- ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, 0, NULL);
+ ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -6998,8 +6632,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
- inline_size = btrfs_file_extent_inline_item_len(leaf,
- btrfs_item_nr(path->slots[0]));
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
tmp = kmalloc(inline_size, GFP_NOFS);
if (!tmp)
return -ENOMEM;
@@ -7773,6 +7406,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+ bool can_nocow = false;
+ bool space_reserved = false;
int ret = 0;
/*
@@ -7787,9 +7424,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
-
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
else
@@ -7799,53 +7433,92 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
if (can_nocow_extent(inode, start, &len, &orig_start,
&orig_block_len, &ram_bytes, false) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start)) {
- struct extent_map *em2;
+ btrfs_inc_nocow_writers(fs_info, block_start))
+ can_nocow = true;
+ }
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
+ if (can_nocow) {
+ struct extent_map *em2;
+
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
btrfs_dec_nocow_writers(fs_info, block_start);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em = em2;
- }
+ goto out;
+ }
+ space_reserved = true;
- if (em2 && IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
- /*
- * For inode marked NODATACOW or extent marked PREALLOC,
- * use the existing or preallocated extent, so does not
- * need to adjust btrfs_space_info's bytes_may_use.
- */
- btrfs_free_reserved_data_space_noquota(fs_info, len);
- goto skip_cow;
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(fs_info, block_start);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em = em2;
}
- }
- /* this will cow the extent */
- free_extent_map(em);
- *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+ } else {
+ const u64 prev_len = len;
+
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ /* We have to COW, so need to reserve metadata and data space. */
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, len);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start + len, prev_len - len,
+ true);
}
- len = min(len, em->len - (start - em->start));
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
-skip_cow:
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
-
- dio_data->reserve -= len;
out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ if (can_nocow) {
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ } else {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, len, true);
+ extent_changeset_free(dio_data->data_reserved);
+ dio_data->data_reserved = NULL;
+ }
+ }
return ret;
}
@@ -7887,18 +7560,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (!dio_data)
return -ENOMEM;
- dio_data->length = length;
- if (write) {
- dio_data->reserve = round_up(length, fs_info->sectorsize);
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, dio_data->reserve);
- if (ret) {
- extent_changeset_free(dio_data->data_reserved);
- kfree(dio_data);
- return ret;
- }
- }
iomap->private = dio_data;
@@ -7991,14 +7652,8 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data) {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved, start,
- dio_data->reserve, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
- extent_changeset_free(dio_data->data_reserved);
- kfree(dio_data);
- }
+ kfree(dio_data);
+
return ret;
}
@@ -8028,14 +7683,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ret = -ENOTBLK;
}
- if (write) {
- if (dio_data->reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved, pos,
- dio_data->reserve, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ if (write)
extent_changeset_free(dio_data->data_reserved);
- }
out:
kfree(dio_data);
iomap->private = NULL;
@@ -8884,6 +8533,12 @@ out_noreserve:
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
@@ -8891,7 +8546,6 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
- u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8952,10 +8606,30 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
trans->block_rsv = rsv;
while (1) {
- ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- inode->i_size,
- BTRFS_EXTENT_DATA_KEY,
- &extents_found);
+ struct extent_state *cached_state = NULL;
+ const u64 new_size = inode->i_size;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+
+ control.new_size = new_size;
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+ /*
+ * We want to drop from the next block forward in case this new
+ * size is not block aligned since we will be keeping the last
+ * block of the extent just the way it is.
+ */
+ btrfs_drop_extent_cache(BTRFS_I(inode),
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, 0);
+
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+ (u64)-1, &cached_state);
+
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
@@ -8983,11 +8657,11 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/*
* We can't call btrfs_truncate_block inside a trans handle as we could
- * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
- * we've truncated everything except the last little bit, and can do
- * btrfs_truncate_block and then update the disk_i_size.
+ * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
+ * know we've truncated everything except the last little bit, and can
+ * do btrfs_truncate_block and then update the disk_i_size.
*/
- if (ret == NEED_TRUNCATE_BLOCK) {
+ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -9031,7 +8705,7 @@ out:
* between the old i_size and the new i_size, and there were no prealloc
* extents beyond i_size to drop.
*/
- if (extents_found > 0)
+ if (control.extents_found > 0)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
return ret;
@@ -10595,9 +10269,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10605,6 +10289,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
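The swap activation fix above caps each extent so the total page count never exceeds sis->max, the limit recorded in the swap header, even if the file grew after mkswap. The clamp can be sketched in isolation as follows; the helper name and parameters are illustrative, not kernel symbols.

/*
 * Sketch only: the clamping performed in btrfs_add_swap_extent().
 * header_max corresponds to sis->max, already_added to bsi->nr_pages.
 */
static unsigned long example_clamp_swap_pages(unsigned long nr_pages,
					      unsigned long header_max,
					      unsigned long already_added)
{
	unsigned long max_pages;

	if (already_added >= header_max)
		return 0;	/* the file grew past the swap header's size */

	max_pages = header_max - already_added;
	return min(nr_pages, max_pages);
}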
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fb8cc9642ac4..a5bd6926f7ff 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -387,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
*
* Compatibility:
* - the same type is already running
+ * - when trying to add a device and balance has been paused
* - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
* must check the condition first that would allow none -> @type
*/
@@ -394,7 +395,9 @@ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
spin_lock(&fs_info->super_lock);
- if (fs_info->exclusive_operation == type)
+ if (fs_info->exclusive_operation == type ||
+ (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+ type == BTRFS_EXCLOP_DEV_ADD))
return true;
spin_unlock(&fs_info->super_lock);
@@ -414,6 +417,29 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op)
+{
+ switch (op) {
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ default:
+ btrfs_warn(fs_info,
+ "invalid exclop balance operation %d requested", op);
+ }
+}
+
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
struct inode *inode = file_inode(file);
@@ -518,7 +544,6 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct timespec64 cur_time = current_time(dir);
struct inode *inode;
int ret;
- int err;
dev_t anon_dev = 0;
u64 objectid;
u64 index = 0;
@@ -617,11 +642,13 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
* Since we don't abort the transaction in this case, free the
* tree block so that we don't leak space and leave the
* filesystem in an inconsistent state (an extent item in the
- * extent tree without backreferences). Also no need to have
- * the tree block locked since it is not in any tree at this
- * point, so no other task can find it and use it.
+ * extent tree with a backreference for a root that does not
+ * exist).
*/
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_tree_lock(leaf);
+ btrfs_clean_tree_block(leaf);
+ btrfs_tree_unlock(leaf);
+ btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
goto fail;
}
@@ -696,9 +723,10 @@ fail:
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
- err = btrfs_commit_transaction(trans);
- if (err && !ret)
- ret = err;
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ ret = btrfs_commit_transaction(trans);
if (!ret) {
inode = btrfs_lookup_dentry(dir, dentry);
@@ -2082,7 +2110,7 @@ static noinline int copy_to_sk(struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
item_off = btrfs_item_ptr_offset(leaf, i);
- item_len = btrfs_item_size_nr(leaf, i);
+ item_len = btrfs_item_size(leaf, i);
btrfs_item_key_to_cpu(leaf, key, i);
if (!key_in_sk(key, sk))
@@ -2536,7 +2564,7 @@ static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
btrfs_item_key_to_cpu(leaf, &key, slot);
item_off = btrfs_item_ptr_offset(leaf, slot);
- item_len = btrfs_item_size_nr(leaf, slot);
+ item_len = btrfs_item_size(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
@@ -2738,7 +2766,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
item_off = btrfs_item_ptr_offset(leaf, slot)
+ sizeof(struct btrfs_root_ref);
- item_len = btrfs_item_size_nr(leaf, slot)
+ item_len = btrfs_item_size(leaf, slot)
- sizeof(struct btrfs_root_ref);
read_extent_buffer(leaf, subvol_info->name,
item_off, item_len);
@@ -3146,13 +3174,25 @@ out:
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
struct btrfs_ioctl_vol_args *vol_args;
+ bool restore_op = false;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
- return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
+ if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+
+ /*
+ * We can do the device add because we have a paused balance;
+ * change the exclusive op type and remember we should bring
+ * back the paused balance.
+ */
+ fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
+ btrfs_exclop_start_unlock(fs_info);
+ restore_op = true;
+ }
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
@@ -3168,7 +3208,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- btrfs_exclop_finish(fs_info);
+ if (restore_op)
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ else
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3187,10 +3230,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
return -EPERM;
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
@@ -3622,7 +3663,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
{
struct btrfs_trans_handle *trans;
u64 transid;
- int ret;
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -3634,11 +3674,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
goto out;
}
transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ btrfs_commit_transaction_async(trans);
out:
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -3985,6 +4021,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bool need_unlock; /* for mut. excl. ops lock */
int ret;
+ if (!arg)
+ btrfs_warn(fs_info,
+ "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4057,6 +4097,7 @@ locked:
spin_lock(&fs_info->balance_lock);
bctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
goto do_balance;
}
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 65cb0766e62d..0fb90cbe7669 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -125,6 +125,7 @@ static inline size_t read_compress_length(const char *buf)
static int copy_compressed_data_to_page(char *compressed_data,
size_t compressed_size,
struct page **out_pages,
+ unsigned long max_nr_page,
u32 *cur_out,
const u32 sectorsize)
{
@@ -133,6 +134,9 @@ static int copy_compressed_data_to_page(char *compressed_data,
struct page *cur_page;
char *kaddr;
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
/*
* We never allow a segment header crossing a sector boundary; the previous
* run should ensure we have enough space left inside the sector.
@@ -161,6 +165,10 @@ static int copy_compressed_data_to_page(char *compressed_data,
orig_out + compressed_size - *cur_out);
kunmap(cur_page);
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
@@ -203,6 +211,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
struct page *page_in = NULL;
char *sizes_ptr;
+ const unsigned long max_nr_page = *out_pages;
int ret = 0;
/* Points to the file offset of input data */
u64 cur_in = start;
@@ -210,6 +219,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u32 cur_out = 0;
u32 len = *total_out;
+ ASSERT(max_nr_page > 0);
*out_pages = 0;
*total_out = 0;
*total_in = 0;
@@ -248,7 +258,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
- pages, &cur_out, sectorsize);
+ pages, max_nr_page,
+ &cur_out, sectorsize);
if (ret < 0)
goto out;
@@ -279,6 +290,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = cur_out;
*total_in = cur_in - start;
out:
+ if (page_in)
+ put_page(page_in);
*out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
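
The max_nr_page plumbing above gives copy_compressed_data_to_page() a hard cap on how far it may index into the caller's page array: lzo_compress_pages() reads the capacity out of *out_pages before zeroing it, and the helper bails out with -E2BIG once the output offset would cross that many pages. A stripped-down sketch of the same capacity-in, count-out pattern (helper name and shape are hypothetical, not the btrfs code):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/minmax.h>
#include <linux/mm.h>

/* Copy @len bytes into @pages[], refusing to step past @max_nr_page pages. */
static int copy_capped(const char *src, size_t len, struct page **pages,
		       unsigned long max_nr_page, u32 *cur_out)
{
	while (len) {
		unsigned long index = *cur_out / PAGE_SIZE;
		u32 off = offset_in_page(*cur_out);
		u32 chunk = min_t(u32, len, PAGE_SIZE - off);

		if (index >= max_nr_page)
			return -E2BIG;	/* out of room in pages[] */
		if (!pages[index]) {
			pages[index] = alloc_page(GFP_NOFS);
			if (!pages[index])
				return -ENOMEM;
		}
		memcpy_to_page(pages[index], off, src, chunk);
		src += chunk;
		len -= chunk;
		*cur_out += chunk;
	}
	return 0;
}
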
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index aae1027bd76a..0775ae9f4419 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -85,7 +85,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u64 flags;
u64 offset;
int ref_index = 0;
@@ -200,7 +200,6 @@ void btrfs_print_leaf(struct extent_buffer *l)
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
- struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
struct btrfs_inode_item *ii;
@@ -224,12 +223,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
- item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
i, key.objectid, type, key.offset,
- btrfs_item_offset(l, item), btrfs_item_size(l, item));
+ btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
@@ -347,7 +345,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
case BTRFS_UUID_KEY_SUBVOL:
case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
- btrfs_item_size_nr(l, i));
+ btrfs_item_size(l, i));
break;
}
}
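
This hunk shows the accessor conversion repeated across the series: callers no longer fetch a struct btrfs_item with btrfs_item_nr() and pass it around; btrfs_item_size() and btrfs_item_offset() (like the former _nr variants) now take the slot number directly. A small before/after sketch of a typical caller, using only names visible in the hunks above:

/* Before: look up the item struct, then hand it to the accessor. */
static u32 item_size_old(struct extent_buffer *leaf, int slot)
{
	struct btrfs_item *item = btrfs_item_nr(slot);

	return btrfs_item_size(leaf, item);	/* or btrfs_item_size_nr(leaf, slot) */
}

/* After: the accessor is keyed by slot, no intermediate struct needed. */
static u32 item_size_new(struct extent_buffer *leaf, int slot)
{
	return btrfs_item_size(leaf, slot);
}
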
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b1cb5a8c2999..1a6d2d5b4b33 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -158,7 +158,7 @@ static int iterate_object_props(struct btrfs_root *root,
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
cur = 0;
- total_len = btrfs_item_size_nr(leaf, slot);
+ total_len = btrfs_item_size(leaf, slot);
while (cur < total_len) {
u32 name_len = btrfs_dir_name_len(leaf, di);
@@ -377,8 +377,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
*/
if (need_reserve) {
num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_block_rsv_add(fs_info, trans->block_rsv,
+ num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
if (ret)
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index db680f5be745..8928275823a1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
@@ -1117,8 +1125,19 @@ out_add_root:
goto out_free_path;
}
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * as adding/removing qgroups or adding/deleting qgroup relations,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
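
The unlock/commit/relock above fixes a lock-order inversion: every other qgroup operation starts or joins a transaction first and takes qgroup_ioctl_lock second, so a commit issued while holding the mutex can end up waiting on a task that is itself waiting for the mutex. A minimal sketch of the two orderings (function names are hypothetical, error handling trimmed):

/* Other qgroup ioctls: transaction first, qgroup_ioctl_lock second. */
static int qgroup_op(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans = btrfs_start_transaction(root, 1);

	if (IS_ERR(trans))
		return PTR_ERR(trans);
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	/* ... update qgroup items ... */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return btrfs_end_transaction(trans);
}

/*
 * Quota enable: drop the mutex around the commit so the commit never has to
 * wait for a transaction handle that is blocked on that same mutex.
 */
static int quota_enable_commit(struct btrfs_fs_info *fs_info,
			       struct btrfs_trans_handle *trans)
{
	int ret;

	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ret = btrfs_commit_transaction(trans);
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
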
@@ -1219,7 +1238,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(quota_root->node);
btrfs_clean_tree_block(quota_root->node);
btrfs_tree_unlock(quota_root->node);
- btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
btrfs_put_root(quota_root);
@@ -3141,6 +3161,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
@@ -3150,7 +3171,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int ret;
mutex_lock(&fs_info->qgroup_rescan_lock);
- ret = btrfs_search_slot_for_read(fs_info->extent_root,
+ extent_root = btrfs_extent_root(fs_info,
+ fs_info->qgroup_rescan_progress.objectid);
+ ret = btrfs_search_slot_for_read(extent_root,
&fs_info->qgroup_rescan_progress,
path, 1, 0);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
deleted file mode 100644
index eb96fdc3be25..000000000000
--- a/fs/btrfs/reada.c
+++ /dev/null
@@ -1,1086 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2011 STRATO. All rights reserved.
- */
-
-#include <linux/sched.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include "ctree.h"
-#include "volumes.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "dev-replace.h"
-#include "block-group.h"
-
-#undef DEBUG
-
-/*
- * This is the implementation for the generic read ahead framework.
- *
- * To trigger a readahead, btrfs_reada_add must be called. It will start
- * a read ahead for the given range [start, end) on tree root. The returned
- * handle can either be used to wait on the readahead to finish
- * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
- *
- * The read ahead works as follows:
- * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
- * reada_start_machine will then search for extents to prefetch and trigger
- * some reads. When a read finishes for a node, all contained node/leaf
- * pointers that lie in the given range will also be enqueued. The reads will
- * be triggered in sequential order, thus giving a big win over a naive
- * enumeration. It will also make use of multi-device layouts. Each disk
- * will have its own read pointer and all disks will be utilized in parallel.
- * Also, no two disks will read both sides of a mirror simultaneously, as this
- * would waste seeking capacity. Instead both disks will read different parts
- * of the filesystem.
- * Any number of readaheads can be started in parallel. The read order will be
- * determined globally, i.e. 2 parallel readaheads will normally finish faster
- * than the 2 started one after another.
- */
-
-#define MAX_IN_FLIGHT 6
-
-struct reada_extctl {
- struct list_head list;
- struct reada_control *rc;
- u64 generation;
-};
-
-struct reada_extent {
- u64 logical;
- u64 owner_root;
- struct btrfs_key top;
- struct list_head extctl;
- int refcnt;
- spinlock_t lock;
- struct reada_zone *zones[BTRFS_MAX_MIRRORS];
- int nzones;
- int scheduled;
- int level;
-};
-
-struct reada_zone {
- u64 start;
- u64 end;
- u64 elems;
- struct list_head list;
- spinlock_t lock;
- int locked;
- struct btrfs_device *device;
- struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
- * self */
- int ndevs;
- struct kref refcnt;
-};
-
-struct reada_machine_work {
- struct btrfs_work work;
- struct btrfs_fs_info *fs_info;
-};
-
-static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
-static void reada_control_release(struct kref *kref);
-static void reada_zone_release(struct kref *kref);
-static void reada_start_machine(struct btrfs_fs_info *fs_info);
-static void __reada_start_machine(struct btrfs_fs_info *fs_info);
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 owner_root,
- u64 generation, int level);
-
-/* recurses */
-/* in case of err, eb might be NULL */
-static void __readahead_hook(struct btrfs_fs_info *fs_info,
- struct reada_extent *re, struct extent_buffer *eb,
- int err)
-{
- int nritems;
- int i;
- u64 bytenr;
- u64 generation;
- struct list_head list;
-
- spin_lock(&re->lock);
- /*
- * just take the full list from the extent. afterwards we
- * don't need the lock anymore
- */
- list_replace_init(&re->extctl, &list);
- re->scheduled = 0;
- spin_unlock(&re->lock);
-
- /*
- * this is the error case, the extent buffer has not been
- * read correctly. We won't access anything from it and
- * just cleanup our data structures. Effectively this will
- * cut the branch below this node from read ahead.
- */
- if (err)
- goto cleanup;
-
- /*
- * FIXME: currently we just set nritems to 0 if this is a leaf,
- * effectively ignoring the content. In a next step we could
- * trigger more readahead depending on the content, e.g.
- * fetch the checksums for the extents in the leaf.
- */
- if (!btrfs_header_level(eb))
- goto cleanup;
-
- nritems = btrfs_header_nritems(eb);
- generation = btrfs_header_generation(eb);
- for (i = 0; i < nritems; i++) {
- struct reada_extctl *rec;
- u64 n_gen;
- struct btrfs_key key;
- struct btrfs_key next_key;
-
- btrfs_node_key_to_cpu(eb, &key, i);
- if (i + 1 < nritems)
- btrfs_node_key_to_cpu(eb, &next_key, i + 1);
- else
- next_key = re->top;
- bytenr = btrfs_node_blockptr(eb, i);
- n_gen = btrfs_node_ptr_generation(eb, i);
-
- list_for_each_entry(rec, &list, list) {
- struct reada_control *rc = rec->rc;
-
- /*
- * if the generation doesn't match, just ignore this
- * extctl. This will probably cut off a branch from
- * prefetch. Alternatively one could start a new (sub-)
- * prefetch for this branch, starting again from root.
- * FIXME: move the generation check out of this loop
- */
-#ifdef DEBUG
- if (rec->generation != generation) {
- btrfs_debug(fs_info,
- "generation mismatch for (%llu,%d,%llu) %llu != %llu",
- key.objectid, key.type, key.offset,
- rec->generation, generation);
- }
-#endif
- if (rec->generation == generation &&
- btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
- btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
- reada_add_block(rc, bytenr, &next_key,
- btrfs_header_owner(eb), n_gen,
- btrfs_header_level(eb) - 1);
- }
- }
-
-cleanup:
- /*
- * free extctl records
- */
- while (!list_empty(&list)) {
- struct reada_control *rc;
- struct reada_extctl *rec;
-
- rec = list_first_entry(&list, struct reada_extctl, list);
- list_del(&rec->list);
- rc = rec->rc;
- kfree(rec);
-
- kref_get(&rc->refcnt);
- if (atomic_dec_and_test(&rc->elems)) {
- kref_put(&rc->refcnt, reada_control_release);
- wake_up(&rc->wait);
- }
- kref_put(&rc->refcnt, reada_control_release);
-
- reada_extent_put(fs_info, re); /* one ref for each entry */
- }
-
- return;
-}
-
-int btree_readahead_hook(struct extent_buffer *eb, int err)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int ret = 0;
- struct reada_extent *re;
-
- /* find extent */
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree,
- eb->start >> fs_info->sectorsize_bits);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
- if (!re) {
- ret = -1;
- goto start_machine;
- }
-
- __readahead_hook(fs_info, re, eb, err);
- reada_extent_put(fs_info, re); /* our ref */
-
-start_machine:
- reada_start_machine(fs_info);
- return ret;
-}
-
-static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_io_context *bioc)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
- int ret;
- struct reada_zone *zone;
- struct btrfs_block_group *cache = NULL;
- u64 start;
- u64 end;
- int i;
-
- zone = NULL;
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> fs_info->sectorsize_bits, 1);
- if (ret == 1 && logical >= zone->start && logical <= zone->end) {
- kref_get(&zone->refcnt);
- spin_unlock(&fs_info->reada_lock);
- return zone;
- }
-
- spin_unlock(&fs_info->reada_lock);
-
- cache = btrfs_lookup_block_group(fs_info, logical);
- if (!cache)
- return NULL;
-
- start = cache->start;
- end = start + cache->length - 1;
- btrfs_put_block_group(cache);
-
- zone = kzalloc(sizeof(*zone), GFP_KERNEL);
- if (!zone)
- return NULL;
-
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret) {
- kfree(zone);
- return NULL;
- }
-
- zone->start = start;
- zone->end = end;
- INIT_LIST_HEAD(&zone->list);
- spin_lock_init(&zone->lock);
- zone->locked = 0;
- kref_init(&zone->refcnt);
- zone->elems = 0;
- zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bioc->num_stripes; ++i) {
- /* bounds have already been checked */
- zone->devs[i] = bioc->stripes[i].dev;
- }
- zone->ndevs = bioc->num_stripes;
-
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)(zone->end >> fs_info->sectorsize_bits),
- zone);
-
- if (ret == -EEXIST) {
- kfree(zone);
- ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> fs_info->sectorsize_bits, 1);
- if (ret == 1 && logical >= zone->start && logical <= zone->end)
- kref_get(&zone->refcnt);
- else
- zone = NULL;
- }
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
-
- return zone;
-}
-
-static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
- u64 logical,
- struct btrfs_key *top,
- u64 owner_root, int level)
-{
- int ret;
- struct reada_extent *re = NULL;
- struct reada_extent *re_exist = NULL;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_device *dev;
- struct btrfs_device *prev_dev;
- u64 length;
- int real_stripes;
- int nzones = 0;
- unsigned long index = logical >> fs_info->sectorsize_bits;
- int dev_replace_is_ongoing;
- int have_zone = 0;
-
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree, index);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
-
- if (re)
- return re;
-
- re = kzalloc(sizeof(*re), GFP_KERNEL);
- if (!re)
- return NULL;
-
- re->logical = logical;
- re->top = *top;
- INIT_LIST_HEAD(&re->extctl);
- spin_lock_init(&re->lock);
- re->refcnt = 1;
- re->owner_root = owner_root;
- re->level = level;
-
- /*
- * map block
- */
- length = fs_info->nodesize;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bioc, 0);
- if (ret || !bioc || length < fs_info->nodesize)
- goto error;
-
- if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
- btrfs_err(fs_info,
- "readahead: more than %d copies not supported",
- BTRFS_MAX_MIRRORS);
- goto error;
- }
-
- real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
- for (nzones = 0; nzones < real_stripes; ++nzones) {
- struct reada_zone *zone;
-
- dev = bioc->stripes[nzones].dev;
-
- /* cannot read ahead on missing device. */
- if (!dev->bdev)
- continue;
-
- zone = reada_find_zone(dev, logical, bioc);
- if (!zone)
- continue;
-
- re->zones[re->nzones++] = zone;
- spin_lock(&zone->lock);
- if (!zone->elems)
- kref_get(&zone->refcnt);
- ++zone->elems;
- spin_unlock(&zone->lock);
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
- if (re->nzones == 0) {
- /* not a single zone found, error and out */
- goto error;
- }
-
- /* Insert extent in reada tree + all per-device trees, all or nothing */
- down_read(&fs_info->dev_replace.rwsem);
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret) {
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
-
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_insert(&fs_info->reada_tree, index, re);
- if (ret == -EEXIST) {
- re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
- re_exist->refcnt++;
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- if (ret) {
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- radix_tree_preload_end();
- prev_dev = NULL;
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
- &fs_info->dev_replace);
- for (nzones = 0; nzones < re->nzones; ++nzones) {
- dev = re->zones[nzones]->device;
-
- if (dev == prev_dev) {
- /*
- * in case of DUP, just add the first zone. As both
- * are on the same device, there's nothing to gain
- * from adding both.
- * Also, it wouldn't work, as the tree is per device
- * and adding would fail with EEXIST
- */
- continue;
- }
- if (!dev->bdev)
- continue;
-
- if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
- continue;
-
- if (dev_replace_is_ongoing &&
- dev == fs_info->dev_replace.tgtdev) {
- /*
- * as this device is selected for reading only as
- * a last resort, skip it for read ahead.
- */
- continue;
- }
- prev_dev = dev;
- ret = radix_tree_insert(&dev->reada_extents, index, re);
- if (ret) {
- while (--nzones >= 0) {
- dev = re->zones[nzones]->device;
- BUG_ON(dev == NULL);
- /* ignore whether the entry was inserted */
- radix_tree_delete(&dev->reada_extents, index);
- }
- radix_tree_delete(&fs_info->reada_tree, index);
- spin_unlock(&fs_info->reada_lock);
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- have_zone = 1;
- }
- if (!have_zone)
- radix_tree_delete(&fs_info->reada_tree, index);
- spin_unlock(&fs_info->reada_lock);
- up_read(&fs_info->dev_replace.rwsem);
-
- if (!have_zone)
- goto error;
-
- btrfs_put_bioc(bioc);
- return re;
-
-error:
- for (nzones = 0; nzones < re->nzones; ++nzones) {
- struct reada_zone *zone;
-
- zone = re->zones[nzones];
- kref_get(&zone->refcnt);
- spin_lock(&zone->lock);
- --zone->elems;
- if (zone->elems == 0) {
- /*
- * no fs_info->reada_lock needed, as this can't be
- * the last ref
- */
- kref_put(&zone->refcnt, reada_zone_release);
- }
- spin_unlock(&zone->lock);
-
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
- btrfs_put_bioc(bioc);
- kfree(re);
- return re_exist;
-}
-
-static void reada_extent_put(struct btrfs_fs_info *fs_info,
- struct reada_extent *re)
-{
- int i;
- unsigned long index = re->logical >> fs_info->sectorsize_bits;
-
- spin_lock(&fs_info->reada_lock);
- if (--re->refcnt) {
- spin_unlock(&fs_info->reada_lock);
- return;
- }
-
- radix_tree_delete(&fs_info->reada_tree, index);
- for (i = 0; i < re->nzones; ++i) {
- struct reada_zone *zone = re->zones[i];
-
- radix_tree_delete(&zone->device->reada_extents, index);
- }
-
- spin_unlock(&fs_info->reada_lock);
-
- for (i = 0; i < re->nzones; ++i) {
- struct reada_zone *zone = re->zones[i];
-
- kref_get(&zone->refcnt);
- spin_lock(&zone->lock);
- --zone->elems;
- if (zone->elems == 0) {
- /* no fs_info->reada_lock needed, as this can't be
- * the last ref */
- kref_put(&zone->refcnt, reada_zone_release);
- }
- spin_unlock(&zone->lock);
-
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
-
- kfree(re);
-}
-
-static void reada_zone_release(struct kref *kref)
-{
- struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
- struct btrfs_fs_info *fs_info = zone->device->fs_info;
-
- lockdep_assert_held(&fs_info->reada_lock);
-
- radix_tree_delete(&zone->device->reada_zones,
- zone->end >> fs_info->sectorsize_bits);
-
- kfree(zone);
-}
-
-static void reada_control_release(struct kref *kref)
-{
- struct reada_control *rc = container_of(kref, struct reada_control,
- refcnt);
-
- kfree(rc);
-}
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 owner_root,
- u64 generation, int level)
-{
- struct btrfs_fs_info *fs_info = rc->fs_info;
- struct reada_extent *re;
- struct reada_extctl *rec;
-
- /* takes one ref */
- re = reada_find_extent(fs_info, logical, top, owner_root, level);
- if (!re)
- return -1;
-
- rec = kzalloc(sizeof(*rec), GFP_KERNEL);
- if (!rec) {
- reada_extent_put(fs_info, re);
- return -ENOMEM;
- }
-
- rec->rc = rc;
- rec->generation = generation;
- atomic_inc(&rc->elems);
-
- spin_lock(&re->lock);
- list_add_tail(&rec->list, &re->extctl);
- spin_unlock(&re->lock);
-
- /* leave the ref on the extent */
-
- return 0;
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
-{
- int i;
- unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
-
- for (i = 0; i < zone->ndevs; ++i) {
- struct reada_zone *peer;
- peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
- if (peer && peer->device != zone->device)
- peer->locked = lock;
- }
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static int reada_pick_zone(struct btrfs_device *dev)
-{
- struct reada_zone *top_zone = NULL;
- struct reada_zone *top_locked_zone = NULL;
- u64 top_elems = 0;
- u64 top_locked_elems = 0;
- unsigned long index = 0;
- int ret;
-
- if (dev->reada_curr_zone) {
- reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
- kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
- dev->reada_curr_zone = NULL;
- }
- /* pick the zone with the most elements */
- while (1) {
- struct reada_zone *zone;
-
- ret = radix_tree_gang_lookup(&dev->reada_zones,
- (void **)&zone, index, 1);
- if (ret == 0)
- break;
- index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
- if (zone->locked) {
- if (zone->elems > top_locked_elems) {
- top_locked_elems = zone->elems;
- top_locked_zone = zone;
- }
- } else {
- if (zone->elems > top_elems) {
- top_elems = zone->elems;
- top_zone = zone;
- }
- }
- }
- if (top_zone)
- dev->reada_curr_zone = top_zone;
- else if (top_locked_zone)
- dev->reada_curr_zone = top_locked_zone;
- else
- return 0;
-
- dev->reada_next = dev->reada_curr_zone->start;
- kref_get(&dev->reada_curr_zone->refcnt);
- reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
-
- return 1;
-}
-
-static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 owner_root, int level, int mirror_num,
- struct extent_buffer **eb)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
- if (IS_ERR(buf))
- return 0;
-
- set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
- ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
- if (ret) {
- free_extent_buffer_stale(buf);
- return ret;
- }
-
- if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
- free_extent_buffer_stale(buf);
- return -EIO;
- } else if (extent_buffer_uptodate(buf)) {
- *eb = buf;
- } else {
- free_extent_buffer(buf);
- }
- return 0;
-}
-
-static int reada_start_machine_dev(struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
- struct reada_extent *re = NULL;
- int mirror_num = 0;
- struct extent_buffer *eb = NULL;
- u64 logical;
- int ret;
- int i;
-
- spin_lock(&fs_info->reada_lock);
- if (dev->reada_curr_zone == NULL) {
- ret = reada_pick_zone(dev);
- if (!ret) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- }
- /*
- * FIXME currently we issue the reads one extent at a time. If we have
- * a contiguous block of extents, we could also coalesce them or use
- * plugging to speed things up
- */
- ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> fs_info->sectorsize_bits, 1);
- if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
- ret = reada_pick_zone(dev);
- if (!ret) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- re = NULL;
- ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> fs_info->sectorsize_bits, 1);
- }
- if (ret == 0) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- dev->reada_next = re->logical + fs_info->nodesize;
- re->refcnt++;
-
- spin_unlock(&fs_info->reada_lock);
-
- spin_lock(&re->lock);
- if (re->scheduled || list_empty(&re->extctl)) {
- spin_unlock(&re->lock);
- reada_extent_put(fs_info, re);
- return 0;
- }
- re->scheduled = 1;
- spin_unlock(&re->lock);
-
- /*
- * find mirror num
- */
- for (i = 0; i < re->nzones; ++i) {
- if (re->zones[i]->device == dev) {
- mirror_num = i + 1;
- break;
- }
- }
- logical = re->logical;
-
- atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info, logical, re->owner_root,
- re->level, mirror_num, &eb);
- if (ret)
- __readahead_hook(fs_info, re, NULL, ret);
- else if (eb)
- __readahead_hook(fs_info, re, eb, ret);
-
- if (eb)
- free_extent_buffer(eb);
-
- atomic_dec(&dev->reada_in_flight);
- reada_extent_put(fs_info, re);
-
- return 1;
-
-}
-
-static void reada_start_machine_worker(struct btrfs_work *work)
-{
- struct reada_machine_work *rmw;
- int old_ioprio;
-
- rmw = container_of(work, struct reada_machine_work, work);
-
- old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
- task_nice_ioprio(current));
- set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(rmw->fs_info);
- set_task_ioprio(current, old_ioprio);
-
- atomic_dec(&rmw->fs_info->reada_works_cnt);
-
- kfree(rmw);
-}
-
-/* Try to start up to 10k READA requests for a group of devices */
-static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
-{
- u64 enqueued;
- u64 total = 0;
- struct btrfs_device *device;
-
- do {
- enqueued = 0;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (atomic_read(&device->reada_in_flight) <
- MAX_IN_FLIGHT)
- enqueued += reada_start_machine_dev(device);
- }
- total += enqueued;
- } while (enqueued && total < 10000);
-
- return total;
-}
-
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
- int i;
- u64 enqueued = 0;
-
- mutex_lock(&fs_devices->device_list_mutex);
-
- enqueued += reada_start_for_fsdevs(fs_devices);
- list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
- enqueued += reada_start_for_fsdevs(seed_devs);
-
- mutex_unlock(&fs_devices->device_list_mutex);
- if (enqueued == 0)
- return;
-
- /*
- * If everything is already in the cache, this is effectively single
- * threaded. To a) not hold the caller for too long and b) to utilize
- * more cores, we broke the loop above after 10000 iterations and now
- * enqueue to workers to finish it. This will distribute the load to
- * the cores.
- */
- for (i = 0; i < 2; ++i) {
- reada_start_machine(fs_info);
- if (atomic_read(&fs_info->reada_works_cnt) >
- BTRFS_MAX_MIRRORS * 2)
- break;
- }
-}
-
-static void reada_start_machine(struct btrfs_fs_info *fs_info)
-{
- struct reada_machine_work *rmw;
-
- rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
- if (!rmw) {
- /* FIXME we cannot handle this properly right now */
- BUG();
- }
- btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
- rmw->fs_info = fs_info;
-
- btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
- atomic_inc(&fs_info->reada_works_cnt);
-}
-
-#ifdef DEBUG
-static void dump_devs(struct btrfs_fs_info *fs_info, int all)
-{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- unsigned long index;
- int ret;
- int i;
- int j;
- int cnt;
-
- spin_lock(&fs_info->reada_lock);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
- atomic_read(&device->reada_in_flight));
- index = 0;
- while (1) {
- struct reada_zone *zone;
- ret = radix_tree_gang_lookup(&device->reada_zones,
- (void **)&zone, index, 1);
- if (ret == 0)
- break;
- pr_debug(" zone %llu-%llu elems %llu locked %d devs",
- zone->start, zone->end, zone->elems,
- zone->locked);
- for (j = 0; j < zone->ndevs; ++j) {
- pr_cont(" %lld",
- zone->devs[j]->devid);
- }
- if (device->reada_curr_zone == zone)
- pr_cont(" curr off %llu",
- device->reada_next - zone->start);
- pr_cont("\n");
- index = (zone->end >> fs_info->sectorsize_bits) + 1;
- }
- cnt = 0;
- index = 0;
- while (all) {
- struct reada_extent *re = NULL;
-
- ret = radix_tree_gang_lookup(&device->reada_extents,
- (void **)&re, index, 1);
- if (ret == 0)
- break;
- pr_debug(" re: logical %llu size %u empty %d scheduled %d",
- re->logical, fs_info->nodesize,
- list_empty(&re->extctl), re->scheduled);
-
- for (i = 0; i < re->nzones; ++i) {
- pr_cont(" zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- pr_cont(" %lld",
- re->zones[i]->devs[j]->devid);
- }
- }
- pr_cont("\n");
- index = (re->logical >> fs_info->sectorsize_bits) + 1;
- if (++cnt > 15)
- break;
- }
- }
-
- index = 0;
- cnt = 0;
- while (all) {
- struct reada_extent *re = NULL;
-
- ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
- index, 1);
- if (ret == 0)
- break;
- if (!re->scheduled) {
- index = (re->logical >> fs_info->sectorsize_bits) + 1;
- continue;
- }
- pr_debug("re: logical %llu size %u list empty %d scheduled %d",
- re->logical, fs_info->nodesize,
- list_empty(&re->extctl), re->scheduled);
- for (i = 0; i < re->nzones; ++i) {
- pr_cont(" zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- pr_cont(" %lld",
- re->zones[i]->devs[j]->devid);
- }
- }
- pr_cont("\n");
- index = (re->logical >> fs_info->sectorsize_bits) + 1;
- }
- spin_unlock(&fs_info->reada_lock);
-}
-#endif
-
-/*
- * interface
- */
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *key_start, struct btrfs_key *key_end)
-{
- struct reada_control *rc;
- u64 start;
- u64 generation;
- int ret;
- int level;
- struct extent_buffer *node;
- static struct btrfs_key max_key = {
- .objectid = (u64)-1,
- .type = (u8)-1,
- .offset = (u64)-1
- };
-
- rc = kzalloc(sizeof(*rc), GFP_KERNEL);
- if (!rc)
- return ERR_PTR(-ENOMEM);
-
- rc->fs_info = root->fs_info;
- rc->key_start = *key_start;
- rc->key_end = *key_end;
- atomic_set(&rc->elems, 0);
- init_waitqueue_head(&rc->wait);
- kref_init(&rc->refcnt);
- kref_get(&rc->refcnt); /* one ref for having elements */
-
- node = btrfs_root_node(root);
- start = node->start;
- generation = btrfs_header_generation(node);
- level = btrfs_header_level(node);
- free_extent_buffer(node);
-
- ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
- generation, level);
- if (ret) {
- kfree(rc);
- return ERR_PTR(ret);
- }
-
- reada_start_machine(root->fs_info);
-
- return rc;
-}
-
-#ifdef DEBUG
-int btrfs_reada_wait(void *handle)
-{
- struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->fs_info;
-
- while (atomic_read(&rc->elems)) {
- if (!atomic_read(&fs_info->reada_works_cnt))
- reada_start_machine(fs_info);
- wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
- 5 * HZ);
- dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
- }
-
- dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-
- kref_put(&rc->refcnt, reada_control_release);
-
- return 0;
-}
-#else
-int btrfs_reada_wait(void *handle)
-{
- struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->fs_info;
-
- while (atomic_read(&rc->elems)) {
- if (!atomic_read(&fs_info->reada_works_cnt))
- reada_start_machine(fs_info);
- wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
- (HZ + 9) / 10);
- }
-
- kref_put(&rc->refcnt, reada_control_release);
-
- return 0;
-}
-#endif
-
-void btrfs_reada_detach(void *handle)
-{
- struct reada_control *rc = handle;
-
- kref_put(&rc->refcnt, reada_control_release);
-}
-
-/*
- * Before removing a device (device replace or device remove ioctls), call this
- * function to wait for all existing readahead requests on the device and to
- * make sure no one queues more readahead requests for the device.
- *
- * Must be called while holding neither the device list mutex nor the device
- * replace semaphore, otherwise it will deadlock.
- */
-void btrfs_reada_remove_dev(struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
-
- /* Serialize with readahead extent creation at reada_find_extent(). */
- spin_lock(&fs_info->reada_lock);
- set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
- spin_unlock(&fs_info->reada_lock);
-
- /*
- * There might be readahead requests added to the radix trees which
- * were not yet added to the readahead work queue. We need to start
- * them and wait for their completion, otherwise we can end up with
- * use-after-free problems when dropping the last reference on the
- * readahead extents and their zones, as they need to access the
- * device structure.
- */
- reada_start_machine(fs_info);
- btrfs_flush_workqueue(fs_info->readahead_workers);
-}
-
-/*
- * If an error happens while removing a device (device replace or device remove
- * ioctls) after calling btrfs_reada_remove_dev(), call this to undo what that
- * function did. This is safe to call even if btrfs_reada_remove_dev() was not
- * called before.
- */
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
-{
- spin_lock(&dev->fs_info->reada_lock);
- clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
- spin_unlock(&dev->fs_info->reada_lock);
-}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e2b9f8616501..a248f46cfe72 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -435,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct extent_buffer *leaf = path->nodes[0];
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
int type, ret;
@@ -972,6 +972,7 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
/* Walk down all roots and build the ref tree, meant to be called at mount */
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *eb;
int tree_block_level = 0;
@@ -985,7 +986,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- eb = btrfs_read_lock_root_node(fs_info->extent_root);
+ extent_root = btrfs_extent_root(fs_info, 0);
+ eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
@@ -998,7 +1000,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
* would have had to add a ref key item which may appear on a
* different leaf from the original extent item.
*/
- ret = walk_down_tree(fs_info->extent_root, path, level,
+ ret = walk_down_tree(extent_root, path, level,
&bytenr, &num_bytes, &tree_block_level);
if (ret)
break;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index e0f93b357548..a3930da4eb3f 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -439,7 +439,7 @@ process_slot:
break;
}
next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
+ size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 33a0ee7ac590..f5465197996d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -26,6 +26,7 @@
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
+#include "inode-item.h"
/*
* Relocation overview
@@ -1736,7 +1737,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
memset(&next_key, 0, sizeof(next_key));
while (1) {
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ min_reserved,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret)
goto out;
@@ -1855,7 +1857,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
@@ -2323,8 +2325,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* If we get an enospc just kick back -EAGAIN so we know to drop the
* transaction and try to refill when we can flush all the things.
*/
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_LIMIT);
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
while (tmp <= rc->reserved_bytes)
@@ -3149,7 +3151,7 @@ static int add_tree_block(struct reloc_control *rc,
u64 owner = 0;
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
item_size >= sizeof(*ei) + sizeof(*bi)) {
@@ -3550,7 +3552,7 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
RELOCATION_RESERVED_NODES;
- ret = btrfs_block_rsv_refill(rc->extent_root,
+ ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
rc->block_rsv, rc->block_rsv->size,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
@@ -3598,9 +3600,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
while (1) {
rc->reserved_bytes = 0;
- ret = btrfs_block_rsv_refill(rc->extent_root,
- rc->block_rsv, rc->block_rsv->size,
- BTRFS_RESERVE_FLUSH_ALL);
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
err = ret;
break;
@@ -3858,25 +3860,14 @@ out:
* 0 success
* -EINPROGRESS operation is already in progress, that's probably a bug
* -ECANCELED cancellation request was set before the operation started
- * -EAGAIN can not start because there are ongoing send operations
*/
static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
{
- spin_lock(&fs_info->send_reloc_lock);
- if (fs_info->send_in_progress) {
- btrfs_warn_rl(fs_info,
-"cannot run relocation while send operations are in progress (%d in progress)",
- fs_info->send_in_progress);
- spin_unlock(&fs_info->send_reloc_lock);
- return -EAGAIN;
- }
if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
/* This should not happen */
- spin_unlock(&fs_info->send_reloc_lock);
btrfs_err(fs_info, "reloc already running, cannot start");
return -EINPROGRESS;
}
- spin_unlock(&fs_info->send_reloc_lock);
if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
btrfs_info(fs_info, "chunk relocation canceled on start");
@@ -3898,9 +3889,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
/* Requested after start, clear bit first so any waiters can continue */
if (atomic_read(&fs_info->reloc_cancel_req) > 0)
btrfs_info(fs_info, "chunk relocation canceled during operation");
- spin_lock(&fs_info->send_reloc_lock);
clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
- spin_unlock(&fs_info->send_reloc_lock);
atomic_set(&fs_info->reloc_cancel_req, 0);
}
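
With the send-specific bookkeeping gone, mutual exclusion between relocation and its callers rests entirely on the BTRFS_FS_RELOC_RUNNING bit: an atomic test-and-set claims it, and clear_and_wake_up_bit() releases it and wakes any wait_on_bit() waiters. A pared-down sketch of that pattern (standalone helpers, not the btrfs functions):

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/wait_bit.h>

static int op_start(unsigned long *flags, int bit)
{
	if (test_and_set_bit(bit, flags))
		return -EINPROGRESS;	/* someone else already runs the operation */
	return 0;
}

static void op_end(unsigned long *flags, int bit)
{
	/* Clears the bit and wakes tasks sleeping in wait_on_bit(flags, bit, ...). */
	clear_and_wake_up_bit(bit, flags);
}
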
@@ -3963,7 +3952,7 @@ static const char *stage_to_string(int stage)
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
struct btrfs_block_group *bg;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
struct reloc_control *rc;
struct inode *inode;
struct btrfs_path *path;
@@ -4214,7 +4203,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_end;
}
- rc->extent_root = fs_info->extent_root;
+ rc->extent_root = btrfs_extent_root(fs_info, 0);
set_reloc_control(rc);
@@ -4305,6 +4294,7 @@ out:
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *csum_root;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
@@ -4316,7 +4306,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
disk_bytenr = file_pos + inode->index_cnt;
- ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 12ceb14a1141..3d68d2dcd83e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -25,7 +25,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
u32 len;
int need_reset = 0;
- len = btrfs_item_size_nr(eb, slot);
+ len = btrfs_item_size(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
@@ -146,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
- old_len = btrfs_item_size_nr(l, slot);
+ old_len = btrfs_item_size(l, slot);
/*
* If this is the first time we update the root item which originated
@@ -334,7 +334,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -501,7 +502,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
rsv->space_info = btrfs_find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index cf82ea6f54fb..2e9a322773f2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -39,21 +39,20 @@ struct scrub_block;
struct scrub_ctx;
/*
- * the following three values only influence the performance.
+ * The following two values only influence the performance.
+ *
* The last one configures the number of parallel and outstanding I/O
- * operations. The first two values configure an upper limit for the number
+ * operations. The first one configures an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
-#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
-#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
+#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */
/*
- * the following value times PAGE_SIZE needs to be large enough to match the
+ * The following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
- * Values larger than BTRFS_STRIPE_LEN are not supported.
*/
-#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
struct scrub_recover {
refcount_t refs;
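
With the new definition, SCRUB_MAX_PAGES_PER_BLOCK is derived rather than hard coded. Assuming BTRFS_MAX_METADATA_BLOCKSIZE is 64KiB, the value is unchanged on 4KiB-page systems:

	SCRUB_MAX_PAGES_PER_BLOCK = BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K = 65536 / 4096 = 16

which matches the old literal 16 while making the relationship to the largest supported nodesize explicit.
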
@@ -73,8 +72,8 @@ struct scrub_page {
u64 physical_for_dev_replace;
atomic_t refs;
u8 mirror_num;
- int have_csum:1;
- int io_error:1;
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
@@ -88,11 +87,7 @@ struct scrub_bio {
blk_status_t status;
u64 logical;
u64 physical;
-#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
- struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
-#else
- struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
-#endif
+ struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
int page_count;
int next_free;
struct btrfs_work work;
@@ -163,7 +158,7 @@ struct scrub_ctx {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_rd_bio;
+ int pages_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -174,7 +169,6 @@ struct scrub_ctx {
struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
- int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
struct btrfs_device *wr_tgtdev;
bool flush_all_writes;
@@ -578,7 +572,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+ sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
@@ -616,7 +610,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sctx->wr_curr_bio = NULL;
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);
- sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}
@@ -758,7 +751,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
@@ -852,8 +845,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
- if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
- return btrfs_repair_one_zone(fs_info, logical);
+ if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+ return 0;
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
@@ -1313,7 +1306,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
recover->bioc = bioc;
recover->map_length = mapped_length;
- BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
@@ -1675,7 +1668,7 @@ again:
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_bio);
sbio->bio = bio;
}
@@ -1708,7 +1701,7 @@ again:
sbio->pagev[sbio->page_count] = spage;
scrub_page_get(spage);
sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_wr_bio)
+ if (sbio->page_count == sctx->pages_per_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
@@ -1755,7 +1748,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
struct scrub_ctx *sctx = sbio->sctx;
int i;
- WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+ ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
@@ -2101,7 +2094,7 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_bio);
sbio->bio = bio;
}
@@ -2135,7 +2128,7 @@ again:
scrub_block_get(sblock); /* one for the page added to the bio */
atomic_inc(&sblock->outstanding_pages);
sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_rd_bio)
+ if (sbio->page_count == sctx->pages_per_bio)
scrub_submit(sctx);
return 0;
@@ -2297,7 +2290,7 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
scrub_page_get(spage);
sblock->pagev[index] = spage;
spage->sblock = sblock;
@@ -2369,7 +2362,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
struct scrub_ctx *sctx = sbio->sctx;
int i;
- BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+ ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
if (sbio->status) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
@@ -2631,7 +2624,7 @@ leave_nomem:
scrub_block_put(sblock);
return -ENOMEM;
}
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
/* For scrub block */
scrub_page_get(spage);
sblock->pagev[index] = spage;
@@ -2892,15 +2885,15 @@ static void scrub_parity_put(struct scrub_parity *sparity)
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
- struct btrfs_path *path,
u64 logic_start,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
+ struct btrfs_root *csum_root;
struct btrfs_extent_item *extent;
struct btrfs_io_context *bioc = NULL;
+ struct btrfs_path *path;
u64 flags;
int ret;
int slot;
@@ -2919,6 +2912,16 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
ASSERT(map->stripe_len <= U32_MAX);
nsectors = map->stripe_len >> fs_info->sectorsize_bits;
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
@@ -2928,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
+ btrfs_free_path(path);
return -ENOMEM;
}
@@ -3060,6 +3064,7 @@ again:
extent_dev = bioc->stripes[0].dev;
btrfs_put_bioc(bioc);
+ csum_root = btrfs_csum_root(fs_info, extent_logical);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
extent_logical + extent_len - 1,
@@ -3116,7 +3121,7 @@ out:
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
- btrfs_release_path(path);
+ btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
@@ -3161,17 +3166,18 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
}
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length,
- struct btrfs_block_group *cache)
+ int stripe_index, u64 dev_extent_len)
{
- struct btrfs_path *path, *ppath;
+ struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_root *root;
+ struct btrfs_root *csum_root;
struct btrfs_extent_item *extent;
struct blk_plug plug;
+ const u64 chunk_logical = bg->start;
u64 flags;
int ret;
int slot;
@@ -3183,10 +3189,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 physical_end;
u64 generation;
int mirror_num;
- struct reada_control *reada1;
- struct reada_control *reada2;
struct btrfs_key key;
- struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
u64 extent_logical;
@@ -3202,25 +3205,26 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- physical = map->stripes[num].physical;
+ physical = map->stripes[stripe_index].physical;
offset = 0;
- nstripes = div64_u64(length, map->stripe_len);
+ nstripes = div64_u64(dev_extent_len, map->stripe_len);
mirror_num = 1;
increment = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- offset = map->stripe_len * num;
+ offset = map->stripe_len * stripe_index;
increment = map->stripe_len * map->num_stripes;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
- offset = map->stripe_len * (num / map->sub_stripes);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
increment = map->stripe_len * factor;
- mirror_num = num % map->sub_stripes + 1;
+ mirror_num = stripe_index % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- mirror_num = num % map->num_stripes + 1;
+ mirror_num = stripe_index % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- mirror_num = num % map->num_stripes + 1;
+ mirror_num = stripe_index % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical, num, map, &offset, NULL);
+ get_raid56_logic_offset(physical, stripe_index, map, &offset,
+ NULL);
increment = map->stripe_len * nr_data_stripes(map);
}
@@ -3228,12 +3232,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
- ppath = btrfs_alloc_path();
- if (!ppath) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
-
/*
* work on commit root. The related disk blocks are static as
* long as COW is applied. This means it is safe to rewrite
@@ -3241,20 +3239,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
path->search_commit_root = 1;
path->skip_locking = 1;
+ path->reada = READA_FORWARD;
- ppath->search_commit_root = 1;
- ppath->skip_locking = 1;
- /*
- * trigger the readahead for extent tree csum tree and wait for
- * completion. During readahead, the scrub is officially paused
- * to not hold off transaction commits
- */
- logical = base + offset;
+ logical = chunk_logical + offset;
physical_end = physical + nstripes * map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical_end, num,
+ get_raid56_logic_offset(physical_end, stripe_index,
map, &logic_end, NULL);
- logic_end += base;
+ logic_end += chunk_logical;
} else {
logic_end = logical + increment * nstripes;
}
@@ -3262,32 +3254,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- /* FIXME it might be better to start readahead at commit root */
- key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
- key_end.objectid = logic_end;
- key_end.type = BTRFS_METADATA_ITEM_KEY;
- key_end.offset = (u64)-1;
- reada1 = btrfs_reada_add(root, &key, &key_end);
-
- if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
- key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.type = BTRFS_EXTENT_CSUM_KEY;
- key.offset = logical;
- key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key, &key_end);
- } else {
- reada2 = NULL;
- }
-
- if (!IS_ERR(reada1))
- btrfs_reada_wait(reada1);
- if (!IS_ERR_OR_NULL(reada2))
- btrfs_reada_wait(reada2);
-
+ root = btrfs_extent_root(fs_info, logical);
+ csum_root = btrfs_csum_root(fs_info, logical);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -3333,16 +3301,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, num, map,
- &logical,
+ ret = get_raid56_logic_offset(physical, stripe_index,
+ map, &logical,
&stripe_logical);
- logical += base;
+ logical += chunk_logical;
if (ret) {
/* it is parity strip */
- stripe_logical += base;
+ stripe_logical += chunk_logical;
stripe_end = stripe_logical + increment;
ret = scrub_raid56_parity(sctx, map, scrub_dev,
- ppath, stripe_logical,
+ stripe_logical,
stripe_end);
if (ret)
goto out;
@@ -3419,13 +3387,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Continuing would prevent reusing its device extents
* for new block groups for a long time.
*/
- spin_lock(&cache->lock);
- if (cache->removed) {
- spin_unlock(&cache->lock);
+ spin_lock(&bg->lock);
+ if (bg->removed) {
+ spin_unlock(&bg->lock);
ret = 0;
goto out;
}
- spin_unlock(&cache->lock);
+ spin_unlock(&bg->lock);
extent = btrfs_item_ptr(l, slot,
struct btrfs_extent_item);
@@ -3504,16 +3472,16 @@ again:
loop:
physical += map->stripe_len;
ret = get_raid56_logic_offset(physical,
- num, map, &logical,
- &stripe_logical);
- logical += base;
+ stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
if (ret && physical < physical_end) {
- stripe_logical += base;
+ stripe_logical += chunk_logical;
stripe_end = stripe_logical +
increment;
ret = scrub_raid56_parity(sctx,
- map, scrub_dev, ppath,
+ map, scrub_dev,
stripe_logical,
stripe_end);
if (ret)
@@ -3543,8 +3511,8 @@ skip:
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
- sctx->stat.last_physical = map->stripes[num].physical +
- length;
+ sctx->stat.last_physical = map->stripes[stripe_index].physical +
+ dev_extent_len;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
@@ -3560,14 +3528,14 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
- btrfs_free_path(ppath);
if (sctx->is_dev_replace && ret >= 0) {
int ret2;
- ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
- map->stripes[num].physical,
- physical_end);
+ ret2 = sync_write_pointer_for_zoned(sctx,
+ chunk_logical + offset,
+ map->stripes[stripe_index].physical,
+ physical_end);
if (ret2)
ret = ret2;
}
@@ -3576,10 +3544,10 @@ out:
}
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
struct btrfs_device *scrub_dev,
- u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group *cache)
+ u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
@@ -3589,7 +3557,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
int ret = 0;
read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+ em = lookup_extent_mapping(map_tree, bg->start, bg->length);
read_unlock(&map_tree->lock);
if (!em) {
@@ -3597,26 +3565,24 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
*/
- spin_lock(&cache->lock);
- if (!cache->removed)
+ spin_lock(&bg->lock);
+ if (!bg->removed)
ret = -EINVAL;
- spin_unlock(&cache->lock);
+ spin_unlock(&bg->lock);
return ret;
}
-
- map = em->map_lookup;
- if (em->start != chunk_offset)
+ if (em->start != bg->start)
goto out;
-
- if (em->len < length)
+ if (em->len < dev_extent_len)
goto out;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length, cache);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
+ dev_extent_len);
if (ret)
goto out;
}
@@ -3654,7 +3620,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- u64 length;
u64 chunk_offset;
int ret = 0;
int ro_set;
@@ -3678,6 +3643,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
key.type = BTRFS_DEV_EXTENT_KEY;
while (1) {
+ u64 dev_extent_len;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
break;
@@ -3714,9 +3681,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- length = btrfs_dev_extent_length(l, dev_extent);
+ dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start)
+ if (found_key.offset + dev_extent_len <= start)
goto skip;
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
@@ -3850,13 +3817,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
- dev_replace->cursor_right = found_key.offset + length;
+ dev_replace->cursor_right = found_key.offset + dev_extent_len;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache);
+ ASSERT(cache->start == chunk_offset);
+ ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
+ dev_extent_len);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3937,7 +3905,7 @@ skip_unfreeze:
break;
}
skip:
- key.offset = found_key.offset + length;
+ key.offset = found_key.offset + dev_extent_len;
btrfs_release_path(path);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 040324d71118..d8ccb62aa7d2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,6 +24,7 @@
#include "transaction.h"
#include "compression.h"
#include "xattr.h"
+#include "print-tree.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -98,6 +99,15 @@ struct send_ctx {
struct btrfs_key *cmp_key;
/*
+ * Keep track of the generation of the last transaction that was used
+ * for relocating a block group. This is periodically checked in order
+ * to detect if a relocation happened since the last check, so that we
+ * don't operate on stale extent buffers for nodes (level >= 1) or on
+ * stale disk_bytenr values of file extent items.
+ */
+ u64 last_reloc_trans;
+
+ /*
* infos of the currently processed inode. In case of deleted inodes,
* these are the values from the deleted inode.
*/
@@ -898,7 +908,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
struct btrfs_path *tmp_path;
@@ -930,12 +939,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
if (found_key->type == BTRFS_INODE_REF_KEY) {
ptr = (unsigned long)btrfs_item_ptr(eb, slot,
struct btrfs_inode_ref);
- item = btrfs_item_nr(slot);
- total = btrfs_item_size(eb, item);
+ total = btrfs_item_size(eb, slot);
elem_size = sizeof(*iref);
} else {
ptr = btrfs_item_ptr_offset(eb, slot);
- total = btrfs_item_size_nr(eb, slot);
+ total = btrfs_item_size(eb, slot);
elem_size = sizeof(*extref);
}
@@ -1004,7 +1012,7 @@ out:
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx);
+ void *ctx);
/*
* Helper function to iterate the entries in ONE btrfs_dir_item.
@@ -1018,7 +1026,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
{
int ret = 0;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
@@ -1030,7 +1037,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
u32 total;
int slot;
int num;
- u8 type;
/*
* Start with a small buffer (1 page). If later we end up needing more
@@ -1047,20 +1053,18 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
eb = path->nodes[0];
slot = path->slots[0];
- item = btrfs_item_nr(slot);
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
cur = 0;
len = 0;
- total = btrfs_item_size(eb, item);
+ total = btrfs_item_size(eb, slot);
num = 0;
while (cur < total) {
name_len = btrfs_dir_name_len(eb, di);
data_len = btrfs_dir_data_len(eb, di);
- type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (type == BTRFS_FT_XATTR) {
+ if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@@ -1110,7 +1114,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
cur += len;
ret = iterate(num, &di_key, buf, name_len, buf + name_len,
- data_len, type, ctx);
+ data_len, ctx);
if (ret < 0)
goto out;
if (ret) {
@@ -1427,6 +1431,26 @@ static int find_extent_clone(struct send_ctx *sctx,
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ /*
+ * A transaction commit for a transaction in which block group
+ * relocation was done just happened.
+ * The disk_bytenr of the file extent item we processed is
+ * possibly stale, referring to the extent's location before
+ * relocation. So act as if we haven't found any clone sources
+		 * and fall back to write commands, which will read the correct
+ * data from the new extent location. Otherwise we will fail
+ * below because we haven't found our own back reference or we
+ * could be getting incorrect sources in case the old extent
+ * was already reallocated after the relocation.
+ */
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOENT;
+ goto out;
+ }
+ up_read(&fs_info->commit_root_sem);
+
if (!backref_ctx.found_itself) {
/* found a bug in backref code? */
ret = -EIO;
@@ -1692,8 +1716,7 @@ out:
*/
static int lookup_dir_item_inode(struct btrfs_root *root,
u64 dir, const char *name, int name_len,
- u64 *found_inode,
- u8 *found_type)
+ u64 *found_inode)
{
int ret = 0;
struct btrfs_dir_item *di;
@@ -1716,7 +1739,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
goto out;
}
*found_inode = key.objectid;
- *found_type = btrfs_dir_type(path->nodes[0], di);
out:
btrfs_free_path(path);
@@ -1839,7 +1861,6 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
int ret = 0;
u64 gen;
u64 other_inode = 0;
- u8 other_type = 0;
if (!sctx->parent_root)
goto out;
@@ -1867,7 +1888,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
}
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
- &other_inode, &other_type);
+ &other_inode);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1912,7 +1933,6 @@ static int did_overwrite_ref(struct send_ctx *sctx,
int ret = 0;
u64 gen;
u64 ow_inode;
- u8 other_type;
if (!sctx->parent_root)
goto out;
@@ -1936,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
/* check if the ref was overwritten by another ref */
ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
- &ow_inode, &other_type);
+ &ow_inode);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -3622,7 +3642,7 @@ static int is_ancestor(struct btrfs_root *root,
key.type != BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
while (cur_offset < item_size) {
u64 parent;
u64 parent_gen;
@@ -4651,9 +4671,8 @@ out:
}
static int __process_new_xattr(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
+ const char *name, int name_len, const char *data,
+ int data_len, void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4697,8 +4716,7 @@ out:
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
+ const char *data, int data_len, void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4743,10 +4761,8 @@ struct find_xattr_ctx {
int found_data_len;
};
-static int __find_xattr(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *vctx)
+static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
+ int name_len, const char *data, int data_len, void *vctx)
{
struct find_xattr_ctx *ctx = vctx;
@@ -4796,7 +4812,7 @@ static int find_xattr(struct btrfs_root *root,
static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx)
+ void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4808,12 +4824,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
&found_data_len);
if (ret == -ENOENT) {
ret = __process_new_xattr(num, di_key, name, name_len, data,
- data_len, type, ctx);
+ data_len, ctx);
} else if (ret >= 0) {
if (data_len != found_data_len ||
memcmp(data, found_data, data_len)) {
ret = __process_new_xattr(num, di_key, name, name_len,
- data, data_len, type, ctx);
+ data, data_len, ctx);
} else {
ret = 0;
}
@@ -4826,7 +4842,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx)
+ void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4835,7 +4851,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
name, name_len, NULL, NULL);
if (ret == -ENOENT)
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
- data_len, type, ctx);
+ data_len, ctx);
else if (ret >= 0)
ret = 0;
@@ -6566,7 +6582,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *)(ptr +
@@ -6597,6 +6613,50 @@ static int changed_cb(struct btrfs_path *left_path,
{
int ret = 0;
+ /*
+ * We can not hold the commit root semaphore here. This is because in
+ * the case of sending and receiving to the same filesystem, using a
+ * pipe, could result in a deadlock:
+ *
+ * 1) The task running send blocks on the pipe because it's full;
+ *
+ * 2) The task running receive, which is the only consumer of the pipe,
+ * is waiting for a transaction commit (for example due to a space
+ * reservation when doing a write or triggering a transaction commit
+ * when creating a subvolume);
+ *
+ * 3) The transaction is waiting to write lock the commit root semaphore,
+ * but can not acquire it since it's being held at 1).
+ *
+ * Down this call chain we write to the pipe through kernel_write().
+ * The same type of problem can also happen when sending to a file that
+ * is stored in the same filesystem - when reserving space for a write
+ * into the file, we can trigger a transaction commit.
+ *
+ * Our caller has supplied us with clones of leaves from the send and
+	 * parent roots, so we're safe from a concurrent relocation and
+ * further reallocation of metadata extents while we are here. Below we
+ * also assert that the leaves are clones.
+ */
+ lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+ /*
+ * We always have a send root, so left_path is never NULL. We will not
+ * have a leaf when we have reached the end of the send root but have
+ * not yet reached the end of the parent root.
+ */
+ if (left_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &left_path->nodes[0]->bflags));
+ /*
+ * When doing a full send we don't have a parent root, so right_path is
+ * NULL. When doing an incremental send, we may have reached the end of
+ * the parent root already, so we don't have a leaf at right_path.
+ */
+ if (right_path && right_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &right_path->nodes[0]->bflags));
+
if (result == BTRFS_COMPARE_TREE_SAME) {
if (key->type == BTRFS_INODE_REF_KEY ||
key->type == BTRFS_INODE_EXTREF_KEY) {
@@ -6643,14 +6703,46 @@ out:
return ret;
}
+static int search_key_again(const struct send_ctx *sctx,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key)
+{
+ int ret;
+
+ if (!path->need_commit_sem)
+ lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+ /*
+ * Roots used for send operations are readonly and no one can add,
+ * update or remove keys from them, so we should be able to find our
+ * key again. The only exception is deduplication, which can operate on
+ * readonly roots and add, update or remove keys to/from them - but at
+ * the moment we don't allow it to run in parallel with send.
+ */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ ASSERT(ret <= 0);
+ if (ret > 0) {
+ btrfs_print_tree(path->nodes[path->lowest_level], false);
+ btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+ key->objectid, key->type, key->offset,
+ (root == sctx->parent_root ? "parent" : "send"),
+ root->root_key.objectid, path->lowest_level,
+ path->slots[path->lowest_level]);
+ return -EUCLEAN;
+ }
+
+ return ret;
+}
+
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
+ struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
@@ -6661,6 +6753,10 @@ static int full_send_tree(struct send_ctx *sctx)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
+ down_read(&fs_info->commit_root_sem);
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -6668,15 +6764,35 @@ static int full_send_tree(struct send_ctx *sctx)
goto out_finish;
while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &key, slot);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+ /*
+ * A transaction used for relocating a block group was
+ * committed or is about to finish its commit. Release
+ * our path (leaf) and restart the search, so that we
+ * avoid operating on any file extent items that are
+			 * stale, with a disk_bytenr that reflects a
+			 * pre-relocation value. This way we avoid, as much as
+			 * possible, falling back to regular writes when checking
+ * if we can clone file ranges.
+ */
+ btrfs_release_path(path);
+ ret = search_key_again(sctx, send_root, path, &key);
+ if (ret < 0)
+ goto out;
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
ret = btrfs_next_item(send_root, path);
if (ret < 0)
goto out;
@@ -6694,6 +6810,20 @@ out:
return ret;
}
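
The check in full_send_tree() above is the generic pattern this series relies on, and it reappears in find_extent_clone() and btrfs_compare_trees(): snapshot fs_info->last_reloc_trans under the commit root semaphore and, whenever it moves forward, drop the cached path and redo the search. A condensed sketch using the same names as in the patch:

	down_read(&fs_info->commit_root_sem);
	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
		/* A block group was relocated since our last check. */
		sctx->last_reloc_trans = fs_info->last_reloc_trans;
		up_read(&fs_info->commit_root_sem);
		btrfs_release_path(path);
		ret = search_key_again(sctx, send_root, path, &key);
		if (ret < 0)
			goto out;
	} else {
		up_read(&fs_info->commit_root_sem);
	}
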
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+ struct extent_buffer *clone;
+
+ clone = btrfs_clone_extent_buffer(path->nodes[level]);
+ if (!clone)
+ return -ENOMEM;
+
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = clone;
+
+ return 0;
+}
+
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
struct extent_buffer *eb;
@@ -6703,6 +6833,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
u64 reada_max;
u64 reada_done = 0;
+ lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+
BUG_ON(*level == 0);
eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
@@ -6726,6 +6858,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
+
+ if (*level == 0)
+ return replace_node_with_clone(path, 0);
+
return 0;
}
@@ -6739,8 +6875,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
path->slots[*level]++;
while (path->slots[*level] >= nritems) {
- if (*level == root_level)
+ if (*level == root_level) {
+ path->slots[*level] = nritems - 1;
return -1;
+ }
/* move upnext */
path->slots[*level] = 0;
@@ -6772,14 +6910,20 @@ static int tree_advance(struct btrfs_path *path,
} else {
ret = tree_move_down(path, level, reada_min_gen);
}
- if (ret >= 0) {
- if (*level == 0)
- btrfs_item_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- else
- btrfs_node_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- }
+
+ /*
+ * Even if we have reached the end of a tree, ret is -1, update the key
+ * anyway, so that in case we need to restart due to a block group
+ * relocation, we can assert that the last key of the root node still
+ * exists in the tree.
+ */
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+
return ret;
}
@@ -6791,8 +6935,8 @@ static int tree_compare_item(struct btrfs_path *left_path,
int len1, len2;
unsigned long off1, off2;
- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
if (len1 != len2)
return 1;
@@ -6809,6 +6953,97 @@ static int tree_compare_item(struct btrfs_path *left_path,
}
/*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ * any locks on them, which does not prevent them from having been relocated
+ * and reallocated after the last time we released the commit root semaphore.
+ *    The exceptions are the root nodes, for which we always have a clone, see
+ * the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ * we are safe from the concurrent relocation and reallocation. However they
+ *    can have file extent items with a pre-relocation disk_bytenr value, so we
+ *    restart the search from the current commit roots and clone the new leaves so
+ * that we get the post relocation disk_bytenr values. Not doing so, could
+ * make us clone the wrong data in case there are new extents using the old
+ * disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ const struct btrfs_key *left_key,
+ const struct btrfs_key *right_key,
+ int left_level,
+ int right_level,
+ const struct send_ctx *sctx)
+{
+ int root_level;
+ int ret;
+
+ lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ /*
+	 * Since keys cannot be added to or removed from our roots because they
+ * are readonly and we do not allow deduplication to run in parallel
+ * (which can add, remove or change keys), the layout of the trees should
+ * not change.
+ */
+ left_path->lowest_level = left_level;
+ ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+ if (ret < 0)
+ return ret;
+
+ right_path->lowest_level = right_level;
+ ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If the lowest level nodes are leaves, clone them so that they can be
+ * safely used by changed_cb() while not under the protection of the
+ * commit root semaphore, even if relocation and reallocation happens in
+	 * commit root semaphore, even if relocation and reallocation happen in
+ */
+ if (left_level == 0) {
+ ret = replace_node_with_clone(left_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (right_level == 0) {
+ ret = replace_node_with_clone(right_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * Now clone the root nodes (unless they happen to be the leaves we have
+ * already cloned). This is to protect against concurrent snapshotting of
+ * the send and parent roots (see the comment at btrfs_compare_trees()).
+ */
+ root_level = btrfs_header_level(sctx->send_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(left_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ root_level = btrfs_header_level(sctx->parent_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(right_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* This function compares two trees and calls the provided callback for
* every changed/new/deleted item it finds.
* If shared tree blocks are encountered, whole subtrees are skipped, making
@@ -6836,10 +7071,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
int right_root_level;
int left_level;
int right_level;
- int left_end_reached;
- int right_end_reached;
- int advance_left;
- int advance_right;
+ int left_end_reached = 0;
+ int right_end_reached = 0;
+ int advance_left = 0;
+ int advance_right = 0;
u64 left_blockptr;
u64 right_blockptr;
u64 left_gen;
@@ -6907,12 +7142,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
down_read(&fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
+ /*
+ * We clone the root node of the send and parent roots to prevent races
+ * with snapshot creation of these roots. Snapshot creation COWs the
+ * root node of a tree, so after the transaction is committed the old
+ * extent can be reallocated while this send operation is still ongoing.
+ * So we clone them, under the commit root semaphore, to be race free.
+ */
left_path->nodes[left_level] =
btrfs_clone_extent_buffer(left_root->commit_root);
if (!left_path->nodes[left_level]) {
- up_read(&fs_info->commit_root_sem);
ret = -ENOMEM;
- goto out;
+ goto out_unlock;
}
right_level = btrfs_header_level(right_root->commit_root);
@@ -6920,9 +7161,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->nodes[right_level] =
btrfs_clone_extent_buffer(right_root->commit_root);
if (!right_path->nodes[right_level]) {
- up_read(&fs_info->commit_root_sem);
ret = -ENOMEM;
- goto out;
+ goto out_unlock;
}
/*
* Our right root is the parent root, while the left root is the "send"
@@ -6932,7 +7172,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
* will need to read them at some point.
*/
reada_min_gen = btrfs_header_generation(right_root->commit_root);
- up_read(&fs_info->commit_root_sem);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -6947,11 +7186,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
btrfs_node_key_to_cpu(right_path->nodes[right_level],
&right_key, right_path->slots[right_level]);
- left_end_reached = right_end_reached = 0;
- advance_left = advance_right = 0;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
while (1) {
- cond_resched();
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ up_read(&fs_info->commit_root_sem);
+ cond_resched();
+ down_read(&fs_info->commit_root_sem);
+ }
+
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ ret = restart_after_relocation(left_path, right_path,
+ &left_key, &right_key,
+ left_level, right_level,
+ sctx);
+ if (ret < 0)
+ goto out_unlock;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ }
+
if (advance_left && !left_end_reached) {
ret = tree_advance(left_path, &left_level,
left_root_level,
@@ -6960,7 +7214,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
if (ret == -1)
left_end_reached = ADVANCE;
else if (ret < 0)
- goto out;
+ goto out_unlock;
advance_left = 0;
}
if (advance_right && !right_end_reached) {
@@ -6971,54 +7225,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
if (ret == -1)
right_end_reached = ADVANCE;
else if (ret < 0)
- goto out;
+ goto out_unlock;
advance_right = 0;
}
if (left_end_reached && right_end_reached) {
ret = 0;
- goto out;
+ goto out_unlock;
} else if (left_end_reached) {
if (right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
}
advance_right = ADVANCE;
continue;
} else if (right_end_reached) {
if (left_level == 0) {
+ up_read(&fs_info->commit_root_sem);
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
}
advance_left = ADVANCE;
continue;
}
if (left_level == 0 && right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
sctx);
- if (ret < 0)
- goto out;
advance_left = ADVANCE;
} else if (cmp > 0) {
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
sctx);
- if (ret < 0)
- goto out;
advance_right = ADVANCE;
} else {
enum btrfs_compare_tree_result result;
@@ -7032,11 +7287,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_path, right_path,
&left_key, result, sctx);
- if (ret < 0)
- goto out;
advance_left = ADVANCE;
advance_right = ADVANCE;
}
+
+ if (ret < 0)
+ goto out;
+ down_read(&fs_info->commit_root_sem);
} else if (left_level == right_level) {
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
@@ -7076,6 +7333,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
}
}
+out_unlock:
+ up_read(&fs_info->commit_root_sem);
out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
@@ -7425,21 +7684,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (ret)
goto out;
- spin_lock(&fs_info->send_reloc_lock);
- if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
- spin_unlock(&fs_info->send_reloc_lock);
- btrfs_warn_rl(fs_info,
- "cannot run send because a relocation operation is in progress");
- ret = -EAGAIN;
- goto out;
- }
- fs_info->send_in_progress++;
- spin_unlock(&fs_info->send_reloc_lock);
-
ret = send_subvol(sctx);
- spin_lock(&fs_info->send_reloc_lock);
- fs_info->send_in_progress--;
- spin_unlock(&fs_info->send_reloc_lock);
if (ret < 0)
goto out;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 48d77f360a24..294242c194d8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -617,7 +617,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
enum btrfs_flush_state state, bool for_preempt)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
@@ -844,6 +844,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
+ if (!ticket->steal)
+ return false;
+
if (global_rsv->space_info != space_info)
return false;
@@ -899,8 +902,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (!aborted && ticket->steal &&
- steal_from_global_rsv(fs_info, space_info, ticket))
+ if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
return true;
if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
@@ -1260,18 +1262,23 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int states_nr)
{
u64 to_reclaim;
- int flush_state;
+ int flush_state = 0;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
- if (!to_reclaim) {
+ /*
+ * This is the priority reclaim path, so to_reclaim could be >0 still
+	 * because we may have only satisfied the priority tickets and still
+	 * left non-priority tickets on the list. We would then have
+ * to_reclaim but ->bytes == 0.
+ */
+ if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- flush_state = 0;
- do {
+ while (flush_state < states_nr) {
+ spin_unlock(&space_info->lock);
flush_space(fs_info, space_info, to_reclaim, states[flush_state],
false);
flush_state++;
@@ -1280,23 +1287,49 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- } while (flush_state < states_nr);
+ }
+
+ /* Attempt to steal from the global rsv if we can. */
+ if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ }
+
+ /*
+ * We must run try_granting_tickets here because we could be a large
+ * ticket in front of a smaller ticket that can now be satisfied with
+ * the available space.
+ */
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
+ spin_lock(&space_info->lock);
+
+ /* We could have been granted before we got here. */
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
while (!space_info->full) {
+ spin_unlock(&space_info->lock);
flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
}
+
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
@@ -1378,25 +1411,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
break;
}
- spin_lock(&space_info->lock);
ret = ticket->error;
- if (ticket->bytes || ticket->error) {
- /*
- * We were a priority ticket, so we need to delete ourselves
- * from the list. Because we could have other priority tickets
- * behind us that require less space, run
- * btrfs_try_granting_tickets() to see if their reservations can
- * now be made.
- */
- if (!list_empty(&ticket->list)) {
- remove_ticket(space_info, ticket);
- btrfs_try_granting_tickets(fs_info, space_info);
- }
-
- if (!ret)
- ret = -ENOSPC;
- }
- spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
/*
* Check that we can't have an error set if the reservation succeeded,
@@ -1438,6 +1453,12 @@ static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
space_info->clamp = min(space_info->clamp + 1, 8);
}
+static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_EVICT);
+}
+
/**
* Try to reserve bytes from the block_rsv's space
*
@@ -1511,7 +1532,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
- ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ ticket.steal = can_steal(flush);
if (trace_btrfs_reserve_ticket_enabled())
start_ns = ktime_get_ns();
@@ -1567,7 +1588,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
/**
 * Try to reserve metadata bytes from the block_rsv's space
*
- * @root: the root we're allocating for
+ * @fs_info: the filesystem
* @block_rsv: block_rsv we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
@@ -1579,22 +1600,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
- if (ret == -ENOSPC &&
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
- if (block_rsv != global_rsv &&
- !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
- ret = 0;
- }
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index cb5056472e79..d841fed73492 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -123,7 +123,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
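
With the prototype above taking an fs_info instead of a root, a hypothetical caller now looks roughly like this (sketch only; the chosen rsv and item count are made up for illustration):

	u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	int ret;

	ret = btrfs_reserve_metadata_bytes(fs_info, &fs_info->delayed_block_rsv,
					   bytes, BTRFS_RESERVE_FLUSH_ALL_STEAL);
	if (ret)
		return ret;
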
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a1c54a2c787c..0ec09fe01be6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1842,7 +1842,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
new_pool_size);
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f9eff3b0f77c..beb7f72d50b8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1537,6 +1537,16 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid);
+}
+BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show);
+
static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1572,6 +1582,7 @@ BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
*/
static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, error_stats),
+ BTRFS_ATTR_PTR(devid, fsid),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
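
Assuming the usual devinfo layout, the new attribute is readable from userspace at /sys/fs/btrfs/<fsid>/devinfo/<devid>/fsid. A small standalone reader (the fsid and devid in the path are placeholders):

#include <stdio.h>

int main(void)
{
	/* Placeholder fsid and devid; substitute the real values. */
	const char *path =
		"/sys/fs/btrfs/11111111-2222-3333-4444-555555555555/devinfo/1/fsid";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("fsid of devid 1: %s", buf);
	fclose(f);
	return 0;
}
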
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 3a4099a2bf05..d8e56edd6991 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -204,6 +204,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
+ btrfs_global_root_delete(root);
btrfs_put_root(root);
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 2a95f7224e18..51a8b075c259 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
struct btrfs_path *path = NULL;
struct btrfs_root *root = NULL;
struct extent_buffer *eb;
- struct btrfs_item *item;
char *value = "mary had a little lamb";
char *split1 = "mary had a little";
char *split2 = " lamb";
@@ -61,7 +60,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.offset = 0;
btrfs_setup_item_for_insert(root, path, &key, value_len);
- item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
@@ -90,8 +88,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(0);
- if (btrfs_item_size(eb, item) != strlen(split1)) {
+ if (btrfs_item_size(eb, 0) != strlen(split1)) {
test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
@@ -115,8 +112,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(1);
- if (btrfs_item_size(eb, item) != strlen(split2)) {
+ if (btrfs_item_size(eb, 1) != strlen(split2)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
@@ -147,8 +143,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(0);
- if (btrfs_item_size(eb, item) != strlen(split3)) {
+ if (btrfs_item_size(eb, 0) != strlen(split3)) {
test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
@@ -171,8 +166,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(1);
- if (btrfs_item_size(eb, item) != strlen(split4)) {
+ if (btrfs_item_size(eb, 1) != strlen(split4)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
@@ -195,8 +189,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(2);
- if (btrfs_item_size(eb, item) != strlen(split2)) {
+ if (btrfs_item_size(eb, 2) != strlen(split2)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index c2e72e7a8ff0..a232b15b8021 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -56,6 +56,54 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
return count;
}
+#define STATE_FLAG_STR_LEN 256
+
+#define PRINT_ONE_FLAG(state, dest, cur, name) \
+({ \
+ if (state->state & EXTENT_##name) \
+ cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \
+ "%s" #name, cur == 0 ? "" : "|"); \
+})
+
+static void extent_flag_to_str(const struct extent_state *state, char *dest)
+{
+ int cur = 0;
+
+ dest[0] = 0;
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY);
+ PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
+ PRINT_ONE_FLAG(state, dest, cur, LOCKED);
+ PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
+ PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
+ PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
+ PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
+ PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
+ PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
+ PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
+ PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
+}
+
+static void dump_extent_io_tree(const struct extent_io_tree *tree)
+{
+ struct rb_node *node;
+ char flags_str[STATE_FLAG_STR_LEN];
+
+ node = rb_first(&tree->state);
+ test_msg("io tree content:");
+ while (node) {
+ struct extent_state *state;
+
+ state = rb_entry(node, struct extent_state, rb_node);
+ extent_flag_to_str(state, flags_str);
+ test_msg(" start=%llu len=%llu flags=%s", state->start,
+ state->end + 1 - state->start, flags_str);
+ node = rb_next(node);
+ }
+}
+
static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
@@ -258,6 +306,8 @@ static int test_find_delalloc(u32 sectorsize)
}
ret = 0;
out_bits:
+ if (ret)
+ dump_extent_io_tree(tmp);
clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
@@ -534,6 +584,8 @@ static int test_find_first_clear_extent_bit(void)
ret = 0;
out:
+ if (ret)
+ dump_extent_io_tree(&tree);
clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
return ret;
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 8f05c1eb833f..5930cdcae5cb 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -824,6 +824,184 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
return 0;
}
+static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return true;
+}
+
+static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
+{
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .use_bitmap = bytes_index_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ u64 offset, max_extent_size, bytes;
+ int ret, i;
+
+ test_msg("running bytes index tests");
+
+ /* First just validate that it does everything in order. */
+ offset = 0;
+ for (i = 0; i < 10; i++) {
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+ offset += bytes + sectorsize;
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps do the correct thing. */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ for (i = 0; i < 2; i++) {
+ offset = i * BITS_PER_BITMAP * sectorsize;
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps with different ->max_extent_size. */
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
+
+ ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+
+ offset = BITS_PER_BITMAP * sectorsize;
+ ret = test_add_free_space_entry(cache, offset, sectorsize, 1);
+ if (ret) {
+			test_err("couldn't add bitmap entry");
+ return ret;
+ }
+
+ /*
+	 * Now set a bunch of sectorsize extents in the first entry so its
+ * ->bytes is large.
+ */
+ for (i = 2; i < 20; i += 2) {
+ offset = sectorsize * i;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error populating sparse bitmap %d", ret);
+ return ret;
+ }
+ }
+
+ /*
+ * Now set a contiguous extent in the second bitmap so its
+	 * ->max_extent_size is larger than the first bitmap's.
+ */
+ offset = (BITS_PER_BITMAP * sectorsize) + sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding contiguous extent %d", ret);
+ return ret;
+ }
+
+ /*
+	 * Since we don't set ->max_extent_size unless we search, everything
+ * should be indexed on bytes.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (10 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3,
+ 0, &max_extent_size);
+ if (offset != 0) {
+ test_err("found space to alloc even though we don't have enough space");
+ return -EINVAL;
+ }
+
+ if (max_extent_size != (2 * sectorsize)) {
+ test_err("got the wrong max_extent size %llu expected %llu",
+ max_extent_size, (unsigned long long)(2 * sectorsize));
+ return -EINVAL;
+ }
+
+ /*
+ * The search should have re-arranged the bytes index to use the
+ * ->max_extent_size, validate it's now what we expect it to be.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (2 * sectorsize)) {
+ test_err("error, the bytes index wasn't recalculated properly");
+ return -EINVAL;
+ }
+
+ /* Add another sectorsize to re-arrange the tree back to ->bytes. */
+ offset = (BITS_PER_BITMAP * sectorsize) - sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding extent to the sparse entry %d", ret);
+ return ret;
+ }
+
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (11 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ /*
+ * Now make sure we find our correct entry after searching that will
+ * result in a re-arranging of the tree.
+ */
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2,
+ 0, &max_extent_size);
+ if (offset != (BITS_PER_BITMAP * sectorsize)) {
+ test_err("error, found %llu instead of %llu for our alloc",
+ offset,
+ (unsigned long long)(BITS_PER_BITMAP * sectorsize));
+ return -EINVAL;
+ }
+
+ cache->free_space_ctl->op = orig_free_space_ops;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ return 0;
+}
+
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
@@ -858,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
goto out;
}
- root->fs_info->extent_root = root;
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
ret = test_extents(cache);
if (ret)
@@ -871,6 +1052,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
goto out;
ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
+ if (ret)
+ goto out;
+ ret = test_bytes_index(cache, sectorsize);
out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 2c783d2f5228..13734ed43bfc 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -446,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
- root->fs_info->free_space_root = root;
+ root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 19ba7d5b7d8f..eee1e4459541 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -455,7 +455,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
}
/* We are using this root as our extent root */
- root->fs_info->extent_root = root;
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
/*
* Some of the paths we test assume we have a filled out fs_info, so we
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1c3a1189c0bd..03de89b45f27 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -162,7 +162,17 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *root, *tmp;
struct btrfs_caching_control *caching_ctl, *next;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+
down_write(&fs_info->commit_root_sem);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ fs_info->last_reloc_trans = trans->transid;
+
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
dirty_list) {
list_del_init(&root->dirty_list);
@@ -413,7 +423,6 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
- WARN_ON(root == fs_info->extent_root);
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -628,7 +637,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
if (ret)
goto reserve_fail;
if (delayed_refs_bytes) {
@@ -692,7 +701,6 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->root = root;
refcount_set(&h->use_count, 1);
h->fs_info = root->fs_info;
@@ -1236,6 +1244,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
struct extent_buffer *eb;
int ret;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb, BTRFS_NESTING_COW);
@@ -1267,9 +1281,8 @@ again:
root = list_entry(next, struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
- if (root != fs_info->extent_root)
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
@@ -1299,9 +1312,6 @@ again:
if (!list_empty(&fs_info->dirty_cowonly_roots))
goto again;
- list_add_tail(&fs_info->extent_root->dirty_list,
- &trans->transaction->switch_commits);
-
/* Update dev-replace pointer once everything is committed */
fs_info->dev_replace.committed_cursor_left =
fs_info->dev_replace.cursor_left_last_write_of_item;
@@ -1327,7 +1337,8 @@ void btrfs_add_dead_root(struct btrfs_root *root)
}
/*
- * update all the cowonly tree roots on disk
+ * Update each subvolume root and its relocation root, if it exists, in the tree
+ * of tree roots. Also free log roots if they exist.
*/
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
@@ -1336,6 +1347,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
int i;
int ret;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
@@ -1348,6 +1365,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *root = gang[i];
int ret2;
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
@@ -1472,12 +1497,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
- /*
- * We are going to commit transaction, see btrfs_commit_transaction()
- * comment for reason locking tree_log_mutex
- */
- mutex_lock(&fs_info->tree_log_mutex);
-
ret = commit_fs_roots(trans);
if (ret)
goto out;
@@ -1513,8 +1532,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
"Error while writing out transaction for qgroup");
out:
- mutex_unlock(&fs_info->tree_log_mutex);
-
/*
* Force parent root to be updated, as we recorded it before so its
* last_trans == cur_transid.
@@ -1578,7 +1595,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
- pending->error = btrfs_block_rsv_add(root,
+ pending->error = btrfs_block_rsv_add(fs_info,
&pending->block_rsv,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
@@ -1861,50 +1878,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
return ret;
}
-/*
- * commit transactions asynchronously. once btrfs_commit_transaction_async
- * returns, any subsequent transaction will not be allowed to join.
- */
-struct btrfs_async_commit {
- struct btrfs_trans_handle *newtrans;
- struct work_struct work;
-};
-
-static void do_async_commit(struct work_struct *work)
-{
- struct btrfs_async_commit *ac =
- container_of(work, struct btrfs_async_commit, work);
-
- /*
- * We've got freeze protection passed with the transaction.
- * Tell lockdep about it.
- */
- if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
-
- current->journal_info = ac->newtrans;
-
- btrfs_commit_transaction(ac->newtrans);
- kfree(ac);
-}
-
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_async_commit *ac;
struct btrfs_transaction *cur_trans;
- ac = kmalloc(sizeof(*ac), GFP_NOFS);
- if (!ac)
- return -ENOMEM;
-
- INIT_WORK(&ac->work, do_async_commit);
- ac->newtrans = btrfs_join_transaction(trans->root);
- if (IS_ERR(ac->newtrans)) {
- int err = PTR_ERR(ac->newtrans);
- kfree(ac);
- return err;
- }
+ /* Kick the transaction kthread. */
+ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+ wake_up_process(fs_info->transaction_kthread);
/* take transaction reference */
cur_trans = trans->transaction;
@@ -1913,28 +1894,15 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
btrfs_end_transaction(trans);
/*
- * Tell lockdep we've released the freeze rwsem, since the
- * async commit thread will be the one to unlock it.
- */
- if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_release(fs_info->sb, SB_FREEZE_FS);
-
- schedule_work(&ac->work);
- /*
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
- if (current->journal_info == trans)
- current->journal_info = NULL;
-
btrfs_put_transaction(cur_trans);
- return 0;
}
-
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1986,7 +1954,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
- trace_btrfs_transaction_commit(trans->root);
+ trace_btrfs_transaction_commit(fs_info);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -2200,6 +2168,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
+ /*
+ * We've started the commit, clear the flag in case we were triggered to
+ * do an async commit but somebody else started before the transaction
+ * kthread could do the work.
+ */
+ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
goto scrub_continue;
@@ -2246,24 +2221,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
WARN_ON(cur_trans != trans->transaction);
- /* btrfs_commit_tree_roots is responsible for getting the
- * various roots consistent with each other. Every pointer
- * in the tree of tree roots has to point to the most up to date
- * root for every subvolume and other tree. So, we have to keep
- * the tree logging code from jumping in and changing any
- * of the trees.
- *
- * At this point in the commit, there can't be any tree-log
- * writers, but a little lower down we drop the trans mutex
- * and let new people in. By holding the tree_log_mutex
- * from now until after the super is written, we avoid races
- * with the tree-log code.
- */
- mutex_lock(&fs_info->tree_log_mutex);
-
ret = commit_fs_roots(trans);
if (ret)
- goto unlock_tree_log;
+ goto unlock_reloc;
/*
* Since the transaction is done, we can apply the pending changes
@@ -2282,11 +2242,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
ret = btrfs_qgroup_account_extents(trans);
if (ret < 0)
- goto unlock_tree_log;
+ goto unlock_reloc;
ret = commit_cowonly_roots(trans);
if (ret)
- goto unlock_tree_log;
+ goto unlock_reloc;
/*
* The tasks which save the space cache and inode cache may also
@@ -2294,7 +2254,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- goto unlock_tree_log;
+ goto unlock_reloc;
}
cur_trans = fs_info->running_transaction;
@@ -2327,6 +2287,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_trans_release_chunk_metadata(trans);
+ /*
+ * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
+ * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
+	 * make sure that no other task can start a new transaction and commit a
+	 * log tree before we commit our superblock. Anyone trying to commit a
+	 * log tree locks this mutex before
+ * writing its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
fs_info->running_transaction = NULL;
@@ -2339,10 +2309,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
- /*
- * reloc_mutex has been unlocked, tree_log_mutex is still held
- * but we can't jump to unlock_tree_log causing double unlock
- */
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@@ -2393,7 +2359,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(fs_info->sb);
- trace_btrfs_transaction_commit(trans->root);
+ trace_btrfs_transaction_commit(fs_info);
btrfs_scrub_continue(fs_info);
@@ -2404,8 +2370,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
-unlock_tree_log:
- mutex_unlock(&fs_info->tree_log_mutex);
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
scrub_continue:
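
The transaction.c hunks above replace the per-call btrfs_async_commit work item with a much smaller handoff: the caller sets BTRFS_FS_COMMIT_TRANS and wakes the already-running transaction kthread, and btrfs_commit_transaction() clears the bit once a commit is underway so the kthread does not start a second one. What follows is a minimal userspace sketch of that flag-plus-wakeup pattern; every name here is a stand-in and pthreads replace the kernel primitives, so this is an illustration of the handoff only, not the kernel implementation.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct fs_state {
	atomic_bool commit_pending;	/* models BTRFS_FS_COMMIT_TRANS */
	pthread_mutex_t lock;
	pthread_cond_t wake;
	bool stop;
};

static struct fs_state fs = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wake = PTHREAD_COND_INITIALIZER,
};

/* Whoever actually starts a commit clears the flag, as in the clear_bit()
 * added to btrfs_commit_transaction() above. */
static void commit_transaction(const char *who)
{
	atomic_store(&fs.commit_pending, false);
	printf("%s: committing transaction\n", who);
}

/* Stand-in for the transaction kthread main loop. */
static void *transaction_kthread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&fs.lock);
	while (!fs.stop) {
		if (atomic_load(&fs.commit_pending)) {
			pthread_mutex_unlock(&fs.lock);
			commit_transaction("kthread");
			pthread_mutex_lock(&fs.lock);
			continue;
		}
		pthread_cond_wait(&fs.wake, &fs.lock);
	}
	pthread_mutex_unlock(&fs.lock);
	return NULL;
}

/* Stand-in for btrfs_commit_transaction_async(): set the flag, wake the
 * kthread and return without committing in this context. */
static void commit_transaction_async(void)
{
	atomic_store(&fs.commit_pending, true);
	pthread_mutex_lock(&fs.lock);
	pthread_cond_signal(&fs.wake);
	pthread_mutex_unlock(&fs.lock);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, transaction_kthread, NULL);
	commit_transaction_async();
	sleep(1);

	pthread_mutex_lock(&fs.lock);
	fs.stop = true;
	pthread_cond_signal(&fs.wake);
	pthread_mutex_unlock(&fs.lock);
	pthread_join(tid, NULL);
	return 0;
}

Build with cc -pthread. The only detail mirrored from the diff is the ordering: the flag is set before the wakeup and cleared by whichever task wins the race to start the commit.
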
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ba45065f9451..1852ed9de7fd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -135,7 +135,6 @@ struct btrfs_trans_handle {
bool removing_chunk;
bool reloc_reserved;
bool in_fsync;
- struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
};
@@ -217,7 +216,7 @@ void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7733e8ac0a69..72e1c942197d 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -202,7 +202,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_file_extent_item *fi;
u32 sectorsize = fs_info->sectorsize;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u64 extent_end;
if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
@@ -354,17 +354,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
key->offset, sectorsize);
return -EUCLEAN;
}
- if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) {
+ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) {
generic_err(leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
- btrfs_item_size_nr(leaf, slot), csumsize);
+ btrfs_item_size(leaf, slot), csumsize);
return -EUCLEAN;
}
if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
u64 prev_csum_end;
u32 prev_item_size;
- prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+ prev_item_size = btrfs_item_size(leaf, slot - 1);
prev_csum_end = (prev_item_size / csumsize) * sectorsize;
prev_csum_end += prev_key->offset;
if (unlikely(prev_csum_end > key->offset)) {
@@ -483,7 +483,7 @@ static int check_dir_item(struct extent_buffer *leaf,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dir_item *di;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u32 cur = 0;
if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
@@ -640,7 +640,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_block_group_item bgi;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u64 flags;
u64 type;
@@ -912,10 +912,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
{
int num_stripes;
- if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) {
+ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(struct btrfs_chunk),
BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
return -EUCLEAN;
@@ -927,10 +927,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
goto out;
if (unlikely(btrfs_chunk_item_size(num_stripes) !=
- btrfs_item_size_nr(leaf, slot))) {
+ btrfs_item_size(leaf, slot))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
btrfs_chunk_item_size(num_stripes));
return -EUCLEAN;
}
@@ -1095,12 +1095,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (unlikely(ret < 0))
return ret;
- if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
- btrfs_item_size_nr(leaf, slot) !=
+ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size(leaf, slot) !=
btrfs_legacy_root_item_size())) {
generic_err(leaf, slot,
"invalid root item size, have %u expect %zu or %u",
- btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_item_size(leaf, slot), sizeof(ri),
btrfs_legacy_root_item_size());
return -EUCLEAN;
}
@@ -1111,7 +1111,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
* And since we allow generation_v2 as 0, it will still pass the check.
*/
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- btrfs_item_size_nr(leaf, slot));
+ btrfs_item_size(leaf, slot));
/* Generation related */
if (unlikely(btrfs_root_generation(&ri) >
@@ -1208,7 +1208,7 @@ static int check_extent_item(struct extent_buffer *leaf,
bool is_tree_block = false;
unsigned long ptr; /* Current pointer inside inline refs */
unsigned long end; /* Extent item end */
- const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 item_size = btrfs_item_size(leaf, slot);
u64 flags;
u64 generation;
u64 total_refs; /* Total refs in btrfs_extent_item */
@@ -1432,10 +1432,10 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
if (key->type == BTRFS_SHARED_DATA_REF_KEY)
expect_item_size = sizeof(struct btrfs_shared_data_ref);
- if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) {
+ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
"invalid item size, have %u expect %u for key type %u",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
expect_item_size, key->type);
return -EUCLEAN;
}
@@ -1460,12 +1460,12 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
{
struct btrfs_extent_data_ref *dref;
unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
- const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
+ const unsigned long end = ptr + btrfs_item_size(leaf, slot);
- if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) {
+ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
generic_err(leaf, slot,
"invalid item size, have %u expect aligned to %zu for key type %u",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(*dref), key->type);
return -EUCLEAN;
}
@@ -1507,16 +1507,16 @@ static int check_inode_ref(struct extent_buffer *leaf,
if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
- if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) {
+ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
return -EUCLEAN;
}
ptr = btrfs_item_ptr_offset(leaf, slot);
- end = ptr + btrfs_item_size_nr(leaf, slot);
+ end = ptr + btrfs_item_size(leaf, slot);
while (ptr < end) {
u16 namelen;
@@ -1689,12 +1689,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
if (slot == 0)
item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
else
- item_end_expected = btrfs_item_offset_nr(leaf,
+ item_end_expected = btrfs_item_offset(leaf,
slot - 1);
- if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
+ if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
generic_err(leaf, slot,
"unexpected item end, have %u expect %u",
- btrfs_item_end_nr(leaf, slot),
+ btrfs_item_data_end(leaf, slot),
item_end_expected);
return -EUCLEAN;
}
@@ -1704,11 +1704,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (unlikely(btrfs_item_end_nr(leaf, slot) >
+ if (unlikely(btrfs_item_data_end(leaf, slot) >
BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
- btrfs_item_end_nr(leaf, slot),
+ btrfs_item_data_end(leaf, slot),
BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
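
The tree-checker.c changes above are a mechanical rename of the item accessors (btrfs_item_size_nr() to btrfs_item_size(), btrfs_item_end_nr() to btrfs_item_data_end()); the invariant enforced by check_leaf() is unchanged: item data is packed from the end of the leaf's data area towards the front with no gaps and no overflow. A small self-contained model of just that invariant, using a made-up struct item rather than the on-disk format:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct item {
	uint32_t offset;	/* start of the item's data in the leaf */
	uint32_t size;		/* length of the item's data */
};

static bool check_leaf_items(const struct item *items, int nritems,
			     uint32_t leaf_data_size)
{
	for (int slot = 0; slot < nritems; slot++) {
		uint32_t data_end = items[slot].offset + items[slot].size;
		uint32_t expected = (slot == 0) ? leaf_data_size
						: items[slot - 1].offset;

		/* mirrors the btrfs_item_data_end() == item_end_expected check */
		if (data_end != expected) {
			fprintf(stderr, "slot %d: unexpected item end %u, expected %u\n",
				slot, (unsigned)data_end, (unsigned)expected);
			return false;
		}
		/* mirrors the "slot end outside of leaf" check */
		if (data_end > leaf_data_size) {
			fprintf(stderr, "slot %d: item end %u outside leaf (max %u)\n",
				slot, (unsigned)data_end, (unsigned)leaf_data_size);
			return false;
		}
	}
	return true;
}

int main(void)
{
	/* two items packed back to back at the end of a 16KiB data area */
	const struct item good[] = { { 16284, 100 }, { 16200, 84 } };
	/* the second item leaves a 4 byte gap and must be rejected */
	const struct item bad[]  = { { 16284, 100 }, { 16196, 84 } };

	printf("good leaf: %s\n", check_leaf_items(good, 2, 16384) ? "ok" : "corrupt");
	printf("bad leaf:  %s\n", check_leaf_items(bad, 2, 16384) ? "ok" : "corrupt");
	return 0;
}
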
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 7c45d960b53c..b6cf39f4e7e4 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -27,14 +27,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int next_key_ret = 0;
u64 last_ret = 0;
- if (root->fs_info->extent_root == root) {
- /*
- * there's recursion here right now in the tree locking,
- * we can't defrag the extent root without deadlock
- */
- goto out;
- }
-
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ab33caf016f..c1ddbe800897 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,6 +20,7 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "inode-item.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -386,7 +387,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
overwrite_root = 1;
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
/* Our caller must have done a search for the key for us. */
@@ -409,7 +410,7 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans,
if (ret == 0) {
char *src_copy;
char *dst_copy;
- u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+ u32 dst_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (dst_size != item_size)
goto insert;
@@ -503,7 +504,7 @@ insert:
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
- found_size = btrfs_item_size_nr(path->nodes[0],
+ found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
btrfs_truncate_item(path, item_size, 1);
@@ -872,17 +873,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
+ csum_root = btrfs_csum_root(fs_info,
+ sums->bytenr);
if (!ret)
- ret = btrfs_del_csums(trans,
- fs_info->csum_root,
+ ret = btrfs_del_csums(trans, csum_root,
sums->bytenr,
sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
- fs_info->csum_root, sums);
+ csum_root,
+ sums);
list_del(&sums->list);
kfree(sums);
}
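
The hunk above is one of several in this series that stop assuming a single fs_info->csum_root and instead look the checksum root up per logical address with btrfs_csum_root(fs_info, bytenr). As a rough mental model only - the ranges, names and partitioning below are invented and are not how btrfs actually maps block groups to global roots - the lookup amounts to routing a bytenr to the root that owns it:

#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for one checksum root covering a slice of the logical
 * address space; not the actual btrfs layout. */
struct csum_root {
	uint64_t start;		/* first logical byte covered */
	uint64_t end;		/* exclusive */
	const char *name;
};

static const struct csum_root roots[] = {
	{ 0,          1ULL << 30, "csum root 0" },
	{ 1ULL << 30, 2ULL << 30, "csum root 1" },
	{ 2ULL << 30, UINT64_MAX, "csum root 2" },
};

/* Models the shape of a btrfs_csum_root(fs_info, bytenr) style lookup. */
static const struct csum_root *csum_root_for(uint64_t bytenr)
{
	for (size_t i = 0; i < sizeof(roots) / sizeof(roots[0]); i++)
		if (bytenr >= roots[i].start && bytenr < roots[i].end)
			return &roots[i];
	return &roots[0];	/* unreachable with the table above */
}

int main(void)
{
	const uint64_t samples[] = { 4096, (1ULL << 30) + 8192, 3ULL << 30 };

	for (size_t i = 0; i < 3; i++)
		printf("bytenr %llu -> %s\n",
		       (unsigned long long)samples[i],
		       csum_root_for(samples[i])->name);
	return 0;
}
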
@@ -1096,7 +1101,7 @@ again:
* otherwise they must be unlinked as a conflict
*/
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
while (ptr < ptr_end) {
victim_ref = (struct btrfs_inode_ref *)ptr;
victim_name_len = btrfs_inode_ref_name_len(leaf,
@@ -1155,7 +1160,7 @@ again:
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
@@ -1181,6 +1186,7 @@ again:
parent_objectid, victim_name,
victim_name_len);
if (ret < 0) {
+ kfree(victim_name);
return ret;
} else if (!ret) {
ret = -ENOENT;
@@ -1317,7 +1323,7 @@ again:
eb = path->nodes[0];
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
while (ref_ptr < ref_end) {
char *name = NULL;
int namelen;
@@ -1503,7 +1509,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
int ref_struct_size;
ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size(eb, slot);
if (key->type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *r;
@@ -1677,7 +1683,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
break;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
cur_offset = 0;
@@ -1731,7 +1737,7 @@ process_slot:
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ ptr_end = ptr + btrfs_item_size(path->nodes[0],
path->slots[0]);
while (ptr < ptr_end) {
struct btrfs_inode_ref *ref;
@@ -1949,6 +1955,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
+static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *dst_di,
+ const struct btrfs_key *log_key,
+ u8 log_type,
+ bool exists)
+{
+ struct btrfs_key found_key;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* The existing dentry points to the same inode, don't delete it. */
+ if (found_key.objectid == log_key->objectid &&
+ found_key.type == log_key->type &&
+ found_key.offset == log_key->offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+ return 1;
+
+ /*
+ * Don't drop the conflicting directory entry if the inode for the new
+ * entry doesn't exist.
+ */
+ if (!exists)
+ return 0;
+
+ return drop_one_dir_item(trans, path, dir, dst_di);
+}
+
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
@@ -1974,14 +2008,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
{
char *name;
int name_len;
- struct btrfs_dir_item *dst_di;
- struct btrfs_key found_key;
+ struct btrfs_dir_item *dir_dst_di;
+ struct btrfs_dir_item *index_dst_di;
+ bool dir_dst_matches = false;
+ bool index_dst_matches = false;
struct btrfs_key log_key;
+ struct btrfs_key search_key;
struct inode *dir;
u8 log_type;
bool exists;
int ret;
- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ bool update_size = true;
bool name_added = false;
dir = read_one_inode(root, key->objectid);
@@ -2007,76 +2044,53 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
exists = (ret == 0);
ret = 0;
- if (key->type == BTRFS_DIR_ITEM_KEY) {
- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
- } else if (key->type == BTRFS_DIR_INDEX_KEY) {
- dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid,
- key->offset, name,
- name_len, 1);
- } else {
- /* Corruption */
- ret = -EINVAL;
- goto out;
- }
-
- if (IS_ERR(dst_di)) {
- ret = PTR_ERR(dst_di);
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ name, name_len, 1);
+ if (IS_ERR(dir_dst_di)) {
+ ret = PTR_ERR(dir_dst_di);
goto out;
- } else if (!dst_di) {
- /* we need a sequence number to insert, so we only
- * do inserts for the BTRFS_DIR_INDEX_KEY types
- */
- if (key->type != BTRFS_DIR_INDEX_KEY)
+ } else if (dir_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ dir_dst_di, &log_key, log_type,
+ exists);
+ if (ret < 0)
goto out;
- goto insert;
+ dir_dst_matches = (ret == 1);
}
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
- /* the existing item matches the logged item */
- if (found_key.objectid == log_key.objectid &&
- found_key.type == log_key.type &&
- found_key.offset == log_key.offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
- update_size = false;
+ btrfs_release_path(path);
+
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid, key->offset,
+ name, name_len, 1);
+ if (IS_ERR(index_dst_di)) {
+ ret = PTR_ERR(index_dst_di);
goto out;
+ } else if (index_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ index_dst_di, &log_key,
+ log_type, exists);
+ if (ret < 0)
+ goto out;
+ index_dst_matches = (ret == 1);
}
- /*
- * don't drop the conflicting directory entry if the inode
- * for the new entry doesn't exist
- */
- if (!exists)
- goto out;
+ btrfs_release_path(path);
- ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
- if (ret)
+ if (dir_dst_matches && index_dst_matches) {
+ ret = 0;
+ update_size = false;
goto out;
-
- if (key->type == BTRFS_DIR_INDEX_KEY)
- goto insert;
-out:
- btrfs_release_path(path);
- if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
- kfree(name);
- iput(dir);
- if (!ret && name_added)
- ret = 1;
- return ret;
-insert:
/*
* Check if the inode reference exists in the log for the given name,
* inode and parent inode
*/
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_REF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -2086,10 +2100,10 @@ insert:
goto out;
}
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_EXTREF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
name_len);
if (ret < 0) {
goto out;
@@ -2108,87 +2122,76 @@ insert:
name_added = true;
update_size = false;
ret = 0;
- goto out;
+
+out:
+ if (!ret && update_size) {
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ }
+ kfree(name);
+ iput(dir);
+ if (!ret && name_added)
+ ret = 1;
+ return ret;
}
-/*
- * find all the names in a directory item and reconcile them into
- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
- * one name in a directory item, but the same code gets used for
- * both directory index types
- */
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- int ret = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ int ret;
struct btrfs_dir_item *di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- struct btrfs_path *fixup_path = NULL;
-
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret < 0)
- break;
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
- /*
- * If this entry refers to a non-directory (directories can not
- * have a link count > 1) and it was added in the transaction
- * that was not committed, make sure we fixup the link count of
- * the inode it the entry points to. Otherwise something like
- * the following would result in a directory pointing to an
- * inode with a wrong link that does not account for this dir
- * entry:
- *
- * mkdir testdir
- * touch testdir/foo
- * touch testdir/bar
- * sync
- *
- * ln testdir/bar testdir/bar_link
- * ln testdir/foo testdir/foo_link
- * xfs_io -c "fsync" testdir/bar
- *
- * <power failure>
- *
- * mount fs, log replay happens
- *
- * File foo would remain with a link count of 1 when it has two
- * entries pointing to it in the directory testdir. This would
- * make it impossible to ever delete the parent directory has
- * it would result in stale dentries that can never be deleted.
- */
- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_key di_key;
+ /* We only log dir index keys, which only contain a single dir item. */
+ ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
- if (!fixup_path) {
- fixup_path = btrfs_alloc_path();
- if (!fixup_path) {
- ret = -ENOMEM;
- break;
- }
- }
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ if (ret < 0)
+ return ret;
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path,
- di_key.objectid);
- if (ret)
- break;
- }
- ret = 0;
+ /*
+ * If this entry refers to a non-directory (directories can not have a
+ * link count > 1) and it was added in the transaction that was not
+ * committed, make sure we fixup the link count of the inode the entry
+ * points to. Otherwise something like the following would result in a
+	 * directory pointing to an inode with a wrong link count that does not
+	 * account for this dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two entries
+ * pointing to it in the directory testdir. This would make it impossible
+	 * to ever delete the parent directory, as it would result in stale
+ * dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_path *fixup_path;
+ struct btrfs_key di_key;
+
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path)
+ return -ENOMEM;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
+ btrfs_free_path(fixup_path);
}
- btrfs_free_path(fixup_path);
+
return ret;
}
@@ -2205,7 +2208,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
- u64 dirid, int key_type,
+ u64 dirid,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
@@ -2218,7 +2221,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
return 1;
key.objectid = dirid;
- key.type = key_type;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2232,7 +2235,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto next;
}
@@ -2259,7 +2262,7 @@ next:
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto out;
}
@@ -2290,95 +2293,82 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
int ret;
struct extent_buffer *eb;
int slot;
- u32 item_size;
struct btrfs_dir_item *di;
- struct btrfs_dir_item *log_di;
int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
char *name;
- struct inode *inode;
+ struct inode *inode = NULL;
struct btrfs_key location;
-again:
+ /*
+	 * Currently we only log dir index keys. Even if we replay a log created
+	 * by an older kernel that logged both dir index and dir item keys, all
+	 * we need to do is process the dir index keys; we (and our caller) can
+ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+ */
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
- goto out;
- }
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
- log_di = NULL;
- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
- log_di = btrfs_lookup_dir_item(trans, log, log_path,
- dir_key->objectid,
- name, name_len, 0);
- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
- log_di = btrfs_lookup_dir_index_item(trans, log,
- log_path,
- dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
- }
- if (!log_di) {
- btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
- btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- kfree(name);
- return -EIO;
- }
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ret = link_to_fixup_dir(trans, root,
- path, location.objectid);
- if (ret) {
- kfree(name);
- iput(inode);
- goto out;
- }
+ read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
- inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
- BTRFS_I(inode), name, name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(trans);
- kfree(name);
- iput(inode);
- if (ret)
- goto out;
+ if (log) {
+ struct btrfs_dir_item *log_di;
- /* there might still be more names under this key
- * check and repeat if required
- */
- ret = btrfs_search_slot(NULL, root, dir_key, path,
- 0, 0);
- if (ret == 0)
- goto again;
+ log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ } else if (log_di) {
+ /* The dentry exists in the log, we have nothing to do. */
ret = 0;
goto out;
- } else if (IS_ERR(log_di)) {
- kfree(name);
- return PTR_ERR(log_di);
}
- btrfs_release_path(log_path);
- kfree(name);
+ }
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
}
- ret = 0;
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ if (ret)
+ goto out;
+
+ inc_nlink(inode);
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ name_len);
+ if (ret)
+ goto out;
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto out;
+
+ /*
+ * Unlike dir item keys, dir index keys can only have one name (entry) in
+ * them, as there are no key collisions since each key has a unique offset
+ * (an index number), so we're done.
+ */
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
+ kfree(name);
+ iput(inode);
return ret;
}
@@ -2421,7 +2411,7 @@ process_leaf:
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size_nr(path->nodes[0], i);
+ total_size = btrfs_item_size(path->nodes[0], i);
cur = 0;
while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
@@ -2498,7 +2488,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
{
u64 range_start;
u64 range_end;
- int key_type = BTRFS_DIR_LOG_ITEM_KEY;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
@@ -2506,7 +2495,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct inode *dir;
dir_key.objectid = dirid;
- dir_key.type = BTRFS_DIR_ITEM_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
@@ -2520,14 +2509,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
btrfs_free_path(log_path);
return 0;
}
-again:
+
range_start = 0;
range_end = 0;
while (1) {
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid, key_type,
+ ret = find_dir_range(log, path, dirid,
&range_start, &range_end);
if (ret < 0)
goto out;
@@ -2554,8 +2543,10 @@ again:
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != dirid ||
- found_key.type != dir_key.type)
- goto next_type;
+ found_key.type != dir_key.type) {
+ ret = 0;
+ goto out;
+ }
if (found_key.offset > range_end)
break;
@@ -2574,15 +2565,7 @@ again:
break;
range_start = range_end + 1;
}
-
-next_type:
ret = 0;
- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
- key_type = BTRFS_DIR_LOG_INDEX_KEY;
- dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(path);
- goto again;
- }
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
@@ -2742,12 +2725,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
eb, i, &key);
if (ret)
break;
- } else if (key.type == BTRFS_DIR_ITEM_KEY) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
- if (ret)
- break;
}
+ /*
+ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+ * BTRFS_DIR_INDEX_KEY items which we use to derive the
+ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+ * older kernel with such keys, ignore them.
+ */
}
btrfs_free_path(path);
return ret;
@@ -2908,6 +2892,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
path->nodes[*level]->len);
if (ret)
return ret;
+ btrfs_redirty_list_add(trans->transaction,
+ next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -2988,6 +2974,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
next->start, next->len);
if (ret)
goto out;
+ btrfs_redirty_list_add(trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -3438,8 +3425,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
extent_io_tree_release(&log->log_csum_range);
- if (trans && log->node)
- btrfs_redirty_list_add(trans->transaction, log->node);
btrfs_put_root(log);
}
@@ -3549,20 +3534,10 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
goto out_unlock;
}
- di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
- name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- if (ret) {
- err = ret;
- goto fail;
- }
- }
- btrfs_release_path(path);
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
index, name, name_len, -1);
if (IS_ERR(di)) {
@@ -3626,7 +3601,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- int key_type, u64 dirid,
+ u64 dirid,
u64 first_offset, u64 last_offset)
{
int ret;
@@ -3635,10 +3610,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
key.objectid = dirid;
key.offset = first_offset;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- key.type = BTRFS_DIR_LOG_ITEM_KEY;
- else
- key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
if (ret)
return ret;
@@ -3673,7 +3645,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
if (count == 1) {
btrfs_item_key_to_cpu(src, &key, start_slot);
- item_size = btrfs_item_size_nr(src, start_slot);
+ item_size = btrfs_item_size(src, start_slot);
batch.keys = &key;
batch.data_sizes = &item_size;
batch.total_data_size = item_size;
@@ -3696,7 +3668,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
const int slot = start_slot + i;
btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
- ins_sizes[i] = btrfs_item_size_nr(src, slot);
+ ins_sizes[i] = btrfs_item_size(src, slot);
batch.total_data_size += ins_sizes[i];
}
}
@@ -3730,7 +3702,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
- int key_type,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *log = inode->root->log_root;
@@ -3738,24 +3709,18 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
const int nritems = btrfs_header_nritems(src);
const u64 ino = btrfs_ino(inode);
const bool inode_logged_before = inode_logged(trans, inode);
- u64 last_logged_key_offset;
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
int i;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- last_logged_key_offset = inode->last_dir_item_offset;
- else
- last_logged_key_offset = inode->last_dir_index_offset;
-
for (i = path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
int ret;
btrfs_item_key_to_cpu(src, &key, i);
- if (key.objectid != ino || key.type != key_type) {
+ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
last_found = true;
break;
}
@@ -3804,7 +3769,7 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
* we logged is in the log tree, saving time and avoiding adding
* contention on the log tree.
*/
- if (key.offset > last_logged_key_offset)
+ if (key.offset > inode->last_dir_index_offset)
goto add_to_batch;
/*
* Check if the key was already logged before. If not we can add
@@ -3863,7 +3828,7 @@ add_to_batch:
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path, int key_type,
+ struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
@@ -3877,7 +3842,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
u64 ino = btrfs_ino(inode);
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = min_offset;
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
@@ -3886,9 +3851,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ if (ret != 0 || min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = (u64)-1;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -3896,7 +3862,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -3907,18 +3873,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_key tmp;
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
- if (key_type == tmp.type)
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
first_offset = max(min_offset, tmp.offset) + 1;
}
goto done;
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (key_type == tmp.type) {
+ if (tmp.type == BTRFS_DIR_INDEX_KEY) {
first_offset = tmp.offset;
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
@@ -3949,8 +3915,7 @@ search:
* from our directory
*/
while (1) {
- ret = process_dir_items_leaf(trans, inode, path, dst_path,
- key_type, ctx);
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
if (ret != 0) {
if (ret < 0)
err = ret;
@@ -3971,11 +3936,12 @@ search:
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
- if (min_key.objectid != ino || min_key.type != key_type) {
+ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+ ctx->last_dir_item_offset = min_key.offset;
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&min_key);
@@ -4001,8 +3967,8 @@ done:
* insert the log range keys to indicate where the log
* is valid
*/
- ret = insert_dir_log_key(trans, log, path, key_type,
- ino, first_offset, last_offset);
+ ret = insert_dir_log_key(trans, log, path, ino, first_offset,
+ last_offset);
if (ret)
err = ret;
}
@@ -4030,35 +3996,28 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 min_key;
u64 max_key;
int ret;
- int key_type = BTRFS_DIR_ITEM_KEY;
/*
* If this is the first time we are being logged in the current
* transaction, or we were logged before but the inode was evicted and
- * reloaded later, in which case its logged_trans is 0, reset the values
- * of the last logged key offsets. Note that we don't use the helper
+ * reloaded later, in which case its logged_trans is 0, reset the value
+ * of the last logged key offset. Note that we don't use the helper
* function inode_logged() here - that is because the function returns
* true after an inode eviction, assuming the worst case as it can not
* know for sure if the inode was logged before. So we can not skip key
* searches in the case the inode was evicted, because it may not have
* been logged in this transaction and may have been logged in a past
- * transaction, so we need to reset the last dir item and index offsets
- * to (u64)-1.
+ * transaction, so we need to reset the last dir index offset to (u64)-1.
*/
- if (inode->logged_trans != trans->transid) {
- inode->last_dir_item_offset = (u64)-1;
+ if (inode->logged_trans != trans->transid)
inode->last_dir_index_offset = (u64)-1;
- }
-again:
+
min_key = 0;
max_key = 0;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- ctx->last_dir_item_offset = inode->last_dir_item_offset;
- else
- ctx->last_dir_item_offset = inode->last_dir_index_offset;
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
while (1) {
- ret = log_dir_items(trans, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -4067,13 +4026,8 @@ again:
min_key = max_key + 1;
}
- if (key_type == BTRFS_DIR_ITEM_KEY) {
- inode->last_dir_item_offset = ctx->last_dir_item_offset;
- key_type = BTRFS_DIR_INDEX_KEY;
- goto again;
- } else {
- inode->last_dir_index_offset = ctx->last_dir_item_offset;
- }
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
+
return 0;
}
@@ -4144,14 +4098,14 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
u64 new_size, u32 min_type)
{
- int ret;
-
- do {
- ret = btrfs_truncate_inode_items(trans, log_root, inode,
- new_size, min_type, NULL);
- } while (ret == -EAGAIN);
+ struct btrfs_truncate_control control = {
+ .new_size = new_size,
+ .ino = btrfs_ino(inode),
+ .min_type = min_type,
+ .skip_ref_updates = true,
+ };
- return ret;
+ return btrfs_truncate_inode_items(trans, log_root, &control);
}
static void fill_inode_item(struct btrfs_trans_handle *trans,
@@ -4347,7 +4301,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
batch.nr = nr;
for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ ins_sizes[i] = btrfs_item_size(src, i + start_slot);
batch.total_data_size += ins_sizes[i];
btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
}
@@ -4391,6 +4345,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
found_type = btrfs_file_extent_type(src, extent);
if (found_type == BTRFS_FILE_EXTENT_REG) {
+ struct btrfs_root *csum_root;
u64 ds, dl, cs, cl;
ds = btrfs_file_extent_disk_bytenr(src,
extent);
@@ -4409,8 +4364,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
cl = dl;
}
- ret = btrfs_lookup_csums_range(
- fs_info->csum_root,
+ csum_root = btrfs_csum_root(fs_info, ds);
+ ret = btrfs_lookup_csums_range(csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
if (ret)
@@ -4462,6 +4417,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *csum_root;
u64 csum_offset;
u64 csum_len;
u64 mod_start = em->mod_start;
@@ -4542,7 +4498,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
+ csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
+ ret = btrfs_lookup_csums_range(csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
csum_len - 1, &ordered_sums, 0);
@@ -5163,7 +5120,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
struct btrfs_path *search_path;
char *name = NULL;
u32 name_len = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u32 cur_offset = 0;
unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
@@ -5896,18 +5853,12 @@ struct btrfs_dir_list {
* link_to_fixup_dir());
*
* 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
* has a size that doesn't match the sum of the lengths of all the logged
- * names. This does not result in a problem because if a dir_item key is
- * logged but its matching dir_index key is not logged, at log replay time we
- * don't use it to replay the respective name (see replay_one_name()). On the
- * other hand if only the dir_index key ends up being logged, the respective
- * name is added to the fs/subvol tree with both the dir_item and dir_index
- * keys created (see replay_one_name()).
- * The directory's inode item with a wrong i_size is not a problem as well,
- * since we don't use it at log replay time to set the i_size in the inode
- * item of the fs/subvol tree (see overwrite_item()).
+ * names - this is not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * do_overwrite_item()).
*/
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -5953,7 +5904,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
goto next_dir_inode;
min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_ITEM_KEY;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = 0;
again:
btrfs_release_path(path);
@@ -5978,7 +5929,7 @@ process_leaf:
btrfs_item_key_to_cpu(leaf, &min_key, i);
if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_ITEM_KEY)
+ min_key.type != BTRFS_DIR_INDEX_KEY)
goto next_dir_inode;
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
@@ -6090,7 +6041,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
@@ -6792,15 +6743,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* was previously logged, make sure the next log attempt on the directory
* is not skipped and logs the inode again. This is because the log may
* not currently be authoritative for a range including the old
- * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
- * sure after a log replay we do not end up with both the new and old
- * dentries around (in case the inode is a directory we would have a
- * directory with two hard links and 2 inode references for different
- * parents). The next log attempt of old_dir will happen at
- * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
- * below, because we have previously set inode->last_unlink_trans to the
- * current transaction ID, either here or at btrfs_record_unlink_dir() in
- * case inode is a directory.
+ * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we
+ * do not end up with both the new and old dentries around (in case the
+ * inode is a directory we would have a directory with two hard links and
+ * 2 inode references for different parents). The next log attempt of
+ * old_dir will happen at btrfs_log_all_parents(), called through
+ * btrfs_log_inode_parent() below, because we have previously set
+ * inode->last_unlink_trans to the current transaction ID, either here or
+ * at btrfs_record_unlink_dir() in case the inode is a directory.
*/
if (old_dir)
old_dir->logged_trans = 0;
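
Summing up the replay side of the tree-log.c changes above: only BTRFS_DIR_INDEX_KEY entries are logged and replayed, and an existing conflicting dentry is handled by the new delete_conflicting_dir_entry() helper - keep it if it already points at the logged inode, keep it if the logged target inode does not exist yet, otherwise unlink it. Below is a minimal userspace sketch of that decision with simplified types and a simplified return convention (the kernel helper returns 1, 0, or the result of drop_one_dir_item()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dentry_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

/*  1: existing entry already matches the logged one, keep it.
 *  0: logged target inode is missing, keep the conflicting entry.
 * -1: conflicting entry dropped (the real code calls drop_one_dir_item()). */
static int resolve_conflict(const struct dentry_key *existing, uint8_t existing_ftype,
			    const struct dentry_key *logged, uint8_t logged_ftype,
			    bool logged_inode_exists)
{
	if (existing->objectid == logged->objectid &&
	    existing->type == logged->type &&
	    existing->offset == logged->offset &&
	    existing_ftype == logged_ftype)
		return 1;

	if (!logged_inode_exists)
		return 0;

	return -1;
}

int main(void)
{
	const struct dentry_key a = { 257, 1, 0 };
	const struct dentry_key b = { 258, 1, 0 };

	printf("same target:       %d\n", resolve_conflict(&a, 1, &a, 1, true));
	printf("missing new inode: %d\n", resolve_conflict(&a, 1, &b, 1, false));
	printf("real conflict:     %d\n", resolve_conflict(&a, 1, &b, 1, true));
	return 0;
}
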
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 74023c8a783f..b458452a1aaf 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
offset = btrfs_item_ptr_offset(eb, slot);
ret = -ENOENT;
@@ -125,7 +125,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
- offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+ offset += btrfs_item_size(eb, slot) - sizeof(subid_le);
} else {
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
@@ -186,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
@@ -208,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
goto out;
}
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (item_size == sizeof(subid)) {
ret = btrfs_del_item(trans, uuid_root, path);
goto out;
@@ -331,7 +331,7 @@ again_search_slot:
goto skip;
offset = btrfs_item_ptr_offset(leaf, slot);
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info,
"uuid item with illegal size %lu!",
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 4968535dfff0..90eb5c2830a9 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -333,7 +333,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
if (key.objectid != btrfs_ino(inode) || key.type != key_type)
break;
- item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
if (copied > 0) {
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 61ac57bcbf1a..b07d382d53a8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -34,6 +34,10 @@
#include "discard.h"
#include "zoned.h"
+#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10 | \
+ BTRFS_BLOCK_GROUP_RAID56_MASK)
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -1162,7 +1166,6 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
ASSERT(list_empty(&device->dev_alloc_list));
ASSERT(list_empty(&device->post_commit_list));
- ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -1370,8 +1373,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
bytenr_orig = btrfs_sb_offset(0);
ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
- if (ret)
- return ERR_PTR(ret);
+ if (ret) {
+ device = ERR_PTR(ret);
+ goto error_bdev_put;
+ }
disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
if (IS_ERR(disk_super)) {
@@ -2144,8 +2149,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
ret = btrfs_shrink_device(device, 0);
- if (!ret)
- btrfs_reada_remove_dev(device);
if (ret)
goto error_undo;
@@ -2243,7 +2246,6 @@ out:
return ret;
error_undo:
- btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
@@ -2429,21 +2431,15 @@ struct btrfs_device *btrfs_find_device_by_devspec(
return device;
}
-/*
- * does all the dirty work required for changing file system's UUID.
- */
-static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
+static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = fs_info->super_copy;
- struct btrfs_device *device;
- u64 super_flags;
lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/*
* Private copy of the seed devices, anchored at
@@ -2451,7 +2447,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
- return PTR_ERR(seed_devices);
+ return seed_devices;
/*
* It's necessary to retain a copy of the original seed fs_devices in
@@ -2462,7 +2458,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
- return PTR_ERR(old_devices);
+ return old_devices;
}
list_add(&old_devices->fs_list, &fs_uuids);
@@ -2473,7 +2469,41 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
- mutex_lock(&fs_devices->device_list_mutex);
+ return seed_devices;
+}
+
+/*
+ * Splice seed devices into the sprout fs_devices.
+ * Generate a new fsid for the sprouted read-write filesystem.
+ */
+static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *seed_devices)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ /*
+	 * We are updating the fsid; a thread running device_list_add()
+ * could race, so uuid_mutex is needed.
+ */
+ lockdep_assert_held(&uuid_mutex);
+
+ /*
+ * The threads listed below may traverse dev_list but can do that without
+ * device_list_mutex:
+ * - All device ops and balance - as we are in btrfs_exclop_start.
+ * - Various dev_list readers - are using RCU.
+ * - btrfs_ioctl_fitrim() - is using RCU.
+ *
+	 * The following read-only threads use device_list_mutex:
+ * - Readonly scrub btrfs_scrub_dev()
+ * - Readonly scrub btrfs_scrub_progress()
+ * - btrfs_get_dev_stats()
+ */
+ lockdep_assert_held(&fs_devices->device_list_mutex);
+
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
@@ -2489,13 +2519,10 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
- mutex_unlock(&fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
-
- return 0;
}
/*
@@ -2586,10 +2613,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct super_block *sb = fs_info->sb;
struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *seed_devices;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
- int seeding_dev = 0;
int ret = 0;
+ bool seeding_dev = false;
bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
@@ -2606,7 +2634,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
if (fs_devices->seeding) {
- seeding_dev = 1;
+ seeding_dev = true;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
locked = true;
@@ -2641,7 +2669,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->fs_info = fs_info;
device->bdev = bdev;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error_free_device;
@@ -2669,18 +2697,25 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
- ret = btrfs_prepare_sprout(fs_info);
- if (ret) {
+
+ /* GFP_KERNEL allocation must not be under device_list_mutex */
+ seed_devices = btrfs_init_sprout(fs_info);
+ if (IS_ERR(seed_devices)) {
+ ret = PTR_ERR(seed_devices);
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
+ }
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ if (seeding_dev) {
+ btrfs_setup_sprout(fs_info, seed_devices);
btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
device);
}
device->fs_devices = fs_devices;
- mutex_lock(&fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
@@ -2742,7 +2777,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
/*
* fs_devices now represents the newly sprouted filesystem and
- * its fsid has been changed by btrfs_prepare_sprout
+ * its fsid has been changed by btrfs_sprout_splice().
*/
btrfs_sysfs_update_sprout_fsid(fs_devices);
}
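
The reordering above keeps the GFP_KERNEL allocation in btrfs_init_sprout() outside device_list_mutex and only takes the mutex for the list splice in btrfs_setup_sprout(). The sketch below models that allocate-then-lock pattern in userspace with a pthread mutex; all names are illustrative, not btrfs code.

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; int value; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

/* Do the (possibly sleeping) allocation first, take the lock only to link. */
static int add_node(int value)
{
	struct node *n = malloc(sizeof(*n));	/* may block */

	if (!n)
		return -1;
	n->value = value;

	pthread_mutex_lock(&list_lock);		/* short critical section */
	n->next = head;
	head = n;
	pthread_mutex_unlock(&list_lock);
	return 0;
}

int main(void)
{
	add_node(1);
	add_node(2);
	return 0;
}

Keeping the sleeping allocation out of the critical section avoids both lock contention and the lock-ordering problems that motivated the split in the patch.
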
@@ -4355,8 +4390,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
- if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
btrfs_info(fs_info, "balance: paused");
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ }
/*
* Balance can be canceled by:
*
@@ -4432,6 +4469,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
/*
* A ro->rw remount sequence should continue with the paused balance
* regardless of who pauses it, system or the user as of now, so set
@@ -4500,7 +4541,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4641,7 +4682,7 @@ int btrfs_uuid_scan_kthread(void *data)
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (item_size < sizeof(root_item))
goto skip;
@@ -5502,7 +5543,6 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
struct btrfs_chunk *chunk;
@@ -5574,7 +5614,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
}
btrfs_set_stack_chunk_length(chunk, bg->length);
- btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
@@ -6312,7 +6352,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	/* Only stripe-based profiles need to check against stripe length. */
+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
u64 max_len = stripe_len - stripe_offset;
/*
@@ -6935,11 +6976,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
- atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
IO_TREE_DEVICE_ALLOC_STATE, NULL);
@@ -7559,6 +7597,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
fs_info->fs_devices->total_rw_bytes = 0;
/*
+ * Lockdep complains about possible circular locking dependency between
+ * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
+	 * used for freeze protection of a fs (struct super_block.s_writers),
+ * which we take when starting a transaction, and extent buffers of the
+ * chunk tree if we call read_one_dev() while holding a lock on an
+ * extent buffer of the chunk tree. Since we are mounting the filesystem
+ * and at this point there can't be any concurrent task modifying the
+ * chunk tree, to keep it simple, just skip locking on the chunk tree.
+ */
+ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+ path->skip_locking = 1;
+
+ /*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk
@@ -7583,10 +7634,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
goto error;
break;
}
- /*
- * The nodes on level 1 are not locked but we don't need to do
- * that during mount time as nothing else can access the tree
- */
node = path->nodes[1];
if (node) {
if (last_ra_node != node->start) {
@@ -7614,7 +7661,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* requirement for chunk allocation, see the comment on
* top of btrfs_chunk_alloc() for details.
*/
- ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
ret = read_one_chunk(&found_key, leaf, chunk);
if (ret)
@@ -7720,7 +7766,7 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
}
slot = path->slots[0];
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
@@ -7798,7 +7844,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
@@ -8288,23 +8334,26 @@ out:
return ret;
}
-int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
struct btrfs_block_group *cache;
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
/* Do not attempt to repair in degraded state */
if (btrfs_test_opt(fs_info, DEGRADED))
- return 0;
+ return true;
cache = btrfs_lookup_block_group(fs_info, logical);
if (!cache)
- return 0;
+ return true;
spin_lock(&cache->lock);
if (cache->relocating_repair) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
- return 0;
+ return true;
}
cache->relocating_repair = 1;
spin_unlock(&cache->lock);
@@ -8312,5 +8361,5 @@ int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
- return 0;
+ return true;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3b8130680749..005c9e2a491a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -123,13 +123,6 @@ struct btrfs_device {
/* per-device scrub information */
struct scrub_ctx *scrub_ctx;
- /* readahead state */
- atomic_t reada_in_flight;
- u64 reada_next;
- struct reada_zone *reada_curr_zone;
- struct radix_tree_root reada_zones;
- struct radix_tree_root reada_extents;
-
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
int dev_stats_valid;
@@ -637,6 +630,6 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
-int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 2837b4c8424d..99abf41b89b9 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const int slot = path->slots[0];
struct extent_buffer *leaf = path->nodes[0];
const u16 old_data_len = btrfs_dir_data_len(leaf, di);
- const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 item_size = btrfs_item_size(leaf, slot);
const u32 data_size = sizeof(*di) + name_len + size;
- struct btrfs_item *item;
unsigned long data_ptr;
char *ptr;
@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_extend_item(path, data_size);
}
- item = btrfs_item_nr(slot);
ptr = btrfs_item_ptr(leaf, slot, char);
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ptr += btrfs_item_size(leaf, slot) - data_size;
di = (struct btrfs_dir_item *)ptr;
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
@@ -335,7 +333,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next_item;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
cur = 0;
while (cur < item_size) {
u16 name_len = btrfs_dir_name_len(leaf, di);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 67d932d70798..f559d517c7c4 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -5,6 +5,7 @@
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
@@ -213,6 +214,8 @@ static int emulate_report_zones(struct btrfs_device *device, u64 pos,
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
struct blk_zone *zones, unsigned int *nr_zones)
{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zno;
int ret;
if (!*nr_zones)
@@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
return 0;
}
+ /* Check cache */
+ if (zinfo->zone_cache) {
+ unsigned int i;
+
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ zno = pos >> zinfo->zone_size_shift;
+ /*
+		 * We cannot report zones beyond the zone end, so it is OK to
+		 * cap *nr_zones at the end.
+ */
+ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
+
+ for (i = 0; i < *nr_zones; i++) {
+ struct blk_zone *zone_info;
+
+ zone_info = &zinfo->zone_cache[zno + i];
+ if (!zone_info->len)
+ break;
+ }
+
+ if (i == *nr_zones) {
+ /* Cache hit on all the zones */
+ memcpy(zones, zinfo->zone_cache + zno,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+ return 0;
+ }
+ }
+
ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
copy_zone_info_cb, zones);
if (ret < 0) {
@@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
if (!ret)
return -EIO;
+ /* Populate cache */
+ if (zinfo->zone_cache)
+ memcpy(zinfo->zone_cache + zno, zones,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+
return 0;
}
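
The cache added above indexes zones by zone number (pos >> zone_size_shift) and only satisfies a report from the cache when every requested entry has been filled in (len != 0); otherwise it queries the device and populates the cache. A compact userspace model of that lookup/populate logic follows, with a fake report_zones() standing in for blkdev_report_zones(); the types and sizes are illustrative.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define NR_ZONES 8

struct zone { uint64_t start; uint64_t len; };	/* len == 0 means "not cached" */

static struct zone cache[NR_ZONES];

/* Pretend to query the device: fill n zones starting at zno. */
static void report_zones(unsigned int zno, struct zone *out, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++) {
		out[i].start = (uint64_t)(zno + i) * 256;
		out[i].len = 256;
	}
}

static void get_zones(unsigned int zno, struct zone *out, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++)
		if (!cache[zno + i].len)
			break;
	if (i == n) {			/* cache hit on all requested zones */
		memcpy(out, &cache[zno], sizeof(*out) * n);
		return;
	}
	report_zones(zno, out, n);	/* miss: ask the device ... */
	memcpy(&cache[zno], out, sizeof(*out) * n);	/* ... and populate */
}

int main(void)
{
	struct zone z[2];

	get_zones(2, z, 2);	/* first call misses and fills the cache */
	get_zones(2, z, 2);	/* second call is served from the cache */
	printf("zone 3 starts at %llu\n", (unsigned long long)z[1].start);
	return 0;
}
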
@@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- ret = btrfs_get_dev_zone_info(device);
+ ret = btrfs_get_dev_zone_info(device, true);
if (ret)
break;
}
@@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
return ret;
}
-int btrfs_get_dev_zone_info(struct btrfs_device *device)
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
@@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return -ENOMEM;
+ device->zone_info = zone_info;
+
if (!bdev_is_zoned(bdev)) {
if (!fs_info->zone_size) {
ret = calculate_emulated_zone_size(fs_info);
@@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
goto out;
}
+ /*
+ * Enable zone cache only for a zoned device. On a non-zoned device, we
+ * fill the zone info with emulated CONVENTIONAL zones, so no need to
+ * use the cache.
+ */
+ if (populate_cache && bdev_is_zoned(device->bdev)) {
+ zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
+ zone_info->nr_zones);
+ if (!zone_info->zone_cache) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to allocate zone cache for %s",
+ rcu_str_deref(device->name));
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
/* Get zones type */
nactive = 0;
while (sector < nr_sectors) {
@@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
kfree(zones);
- device->zone_info = zone_info;
-
switch (bdev_zoned_model(bdev)) {
case BLK_ZONED_HM:
model = "host-managed zoned";
@@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
out:
kfree(zones);
out_free_zone_info:
- bitmap_free(zone_info->active_zones);
- bitmap_free(zone_info->empty_zones);
- bitmap_free(zone_info->seq_zones);
- kfree(zone_info);
- device->zone_info = NULL;
+ btrfs_destroy_dev_zone_info(device);
return ret;
}
@@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->seq_zones);
bitmap_free(zone_info->empty_zones);
+ vfree(zone_info->zone_cache);
kfree(zone_info);
device->zone_info = NULL;
}
@@ -1104,7 +1154,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
u64 *offset_ret)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -1119,6 +1169,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
key.type = 0;
key.offset = 0;
+ root = btrfs_extent_root(fs_info, key.objectid);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* We should not find the exact match */
if (!ret)
@@ -1586,29 +1637,19 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (!btrfs_is_zoned(fs_info))
return true;
- cache = *cache_ret;
+ cache = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!cache)
+ return true;
- if (cache && (eb->start < cache->start ||
- cache->start + cache->length <= eb->start)) {
+ if (cache->meta_write_pointer != eb->start) {
btrfs_put_block_group(cache);
cache = NULL;
- *cache_ret = NULL;
+ ret = false;
+ } else {
+ cache->meta_write_pointer = eb->start + eb->len;
}
- if (!cache)
- cache = btrfs_lookup_block_group(fs_info, eb->start);
-
- if (cache) {
- if (cache->meta_write_pointer != eb->start) {
- btrfs_put_block_group(cache);
- cache = NULL;
- ret = false;
- } else {
- cache->meta_write_pointer = eb->start + eb->len;
- }
-
- *cache_ret = cache;
- }
+ *cache_ret = cache;
return ret;
}
@@ -1860,6 +1901,7 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
@@ -1883,7 +1925,7 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
return ret;
}
-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
struct btrfs_device *device;
bool ret = false;
@@ -1892,8 +1934,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index
return true;
/* Non-single profiles are not supported yet */
- if (raid_index != BTRFS_RAID_SINGLE)
- return false;
+ ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
/* Check if there is a device with active zones left */
mutex_lock(&fs_devices->device_list_mutex);
@@ -1942,6 +1983,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
ASSERT(block_group->alloc_offset == block_group->zone_capacity);
ASSERT(block_group->free_space_ctl->free_space == 0);
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
map = block_group->physical_map;
@@ -1973,3 +2015,21 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
fs_info->data_reloc_bg = 0;
spin_unlock(&fs_info->relocation_bg_lock);
}
+
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->zone_info) {
+ vfree(device->zone_info->zone_cache);
+ device->zone_info->zone_cache = NULL;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index e53ab7b96437..cbf016a7bb5d 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -8,6 +8,7 @@
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
+#include "btrfs_inode.h"
/*
* Block groups with more than this value (percents) of unusable space will be
@@ -28,6 +29,7 @@ struct btrfs_zoned_device_info {
unsigned long *seq_zones;
unsigned long *empty_zones;
unsigned long *active_zones;
+ struct blk_zone *zone_cache;
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
@@ -35,7 +37,7 @@ struct btrfs_zoned_device_info {
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone);
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
-int btrfs_get_dev_zone_info(struct btrfs_device *device);
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
@@ -71,11 +73,11 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
- int raid_index);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_i
return 0;
}
-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
+ bool populate_cache)
{
return 0;
}
@@ -222,7 +225,7 @@ static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
}
static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
- int raid_index)
+ u64 flags)
{
return true;
}
@@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -350,4 +354,20 @@ static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->treelog_bg_lock);
}
+static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ btrfs_inode_lock(&inode->vfs_inode, 0);
+}
+
+static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ btrfs_inode_unlock(&inode->vfs_inode, 0);
+}
+
#endif
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d463d89f5db8..146291be6263 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -117,7 +117,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
root = path.dentry;
ret = -EINVAL;
- if (mnt_user_ns(path.mnt) != &init_user_ns) {
+ if (is_idmapped_mnt(path.mnt)) {
pr_warn("File cache on idmapped mounts not supported");
goto error_unsupported;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b9460b6fb76f..c447fa2e2d1f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4350,7 +4350,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
- bool is_opened = false;
+ bool already_opened = false;
int i;
if (count == 1)
@@ -4358,19 +4358,19 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
- if (bits & (1 << i))
- ci->i_nr_by_mode[i] += count;
-
/*
- * If any of the mode ref is larger than 1,
+ * If any of the mode ref is larger than 0,
* that means it has been already opened by
* others. Just skip checking the PIN ref.
*/
- if (i && ci->i_nr_by_mode[i] > 1)
- is_opened = true;
+ if (i && ci->i_nr_by_mode[i])
+ already_opened = true;
+
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i] += count;
}
- if (!is_opened)
+ if (!already_opened)
percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
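
The reorder above inspects the existing per-mode reference counts before they are incremented, so an inode that was already open in any mode is detected and the opened_inodes metric is only bumped on the first open. A small sketch of that check-before-increment ordering, with illustrative types rather than ceph code:

#include <stdbool.h>
#include <stdio.h>

#define MODE_BITS 4

static int nr_by_mode[MODE_BITS];
static long opened_inodes;

static void get_fmode(int bits, int count)
{
	bool already_opened = false;

	for (int i = 0; i < MODE_BITS; i++) {
		/* look at the old counts first (skip the PIN slot at i == 0) */
		if (i && nr_by_mode[i])
			already_opened = true;

		if (bits & (1 << i))
			nr_by_mode[i] += count;
	}

	if (!already_opened)
		opened_inodes++;
}

int main(void)
{
	get_fmode(0x3, 1);	/* first open: counts as a newly opened inode */
	get_fmode(0x3, 1);	/* second open: already_opened, no extra bump */
	printf("opened_inodes=%ld\n", opened_inodes);	/* prints 1 */
	return 0;
}

Incrementing before checking, as the old code did, would make the first open look like a re-open and under-count opened inodes, which is what the hunk fixes.
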
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 02a0a0fd9ccd..c138e8126286 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -605,13 +605,25 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
in.cap.flags = CEPH_CAP_FLAG_AUTH;
in.ctime = in.mtime = in.atime = iinfo.btime;
- in.mode = cpu_to_le32((u32)mode);
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
- in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
- dir->i_gid : current_fsgid()));
+ if (dir->i_mode & S_ISGID) {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
+
+ /* Directories always inherit the setgid bit. */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
+ !in_group_p(dir->i_gid) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, dir, CAP_FSETID))
+ mode &= ~S_ISGID;
+ } else {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ }
+ in.mode = cpu_to_le32((u32)mode);
+
in.nlink = cpu_to_le32(1);
in.max_size = cpu_to_le64(lo->stripe_unit);
@@ -847,7 +859,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ssize_t ret;
u64 off = iocb->ki_pos;
u64 len = iov_iter_count(to);
- u64 i_size;
+ u64 i_size = i_size_read(inode);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 250aad330a10..c30eefc0ac19 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3683,7 +3683,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
char *path;
- int pathlen, err;
+ int pathlen = 0, err;
u64 pathbase;
u64 snap_follows;
@@ -3703,7 +3703,6 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
} else {
path = NULL;
- pathlen = 0;
pathbase = 0;
}
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 12bde7bfda86..23a1ed2fb769 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -393,26 +393,14 @@ static void cifs_put_swn_reg(struct cifs_swn_reg *swnreg)
static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const char *name, int state)
{
- int i;
-
switch (state) {
case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name);
- for (i = 0; i < swnreg->tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
+ cifs_ses_mark_for_reconnect(swnreg->tcon->ses);
break;
case CIFS_SWN_RESOURCE_STATE_AVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name);
- for (i = 0; i < swnreg->tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
+ cifs_ses_mark_for_reconnect(swnreg->tcon->ses);
break;
case CIFS_SWN_RESOURCE_STATE_UNKNOWN:
cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b50da1901ebd..9e5d9e192ef0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -152,5 +152,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.33"
+#define CIFS_VERSION "2.34"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f3073a62ce57..4f5a3e857df4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -599,6 +599,7 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
bool is_server_using_iface(struct TCP_Server_Info *server,
struct cifs_server_iface *iface);
bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
+void cifs_ses_mark_for_reconnect(struct cifs_ses *ses);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 82577a7a5bb1..1060164b984a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1271,10 +1271,8 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
- if (ctx->nosharesock) {
- server->nosharesock = true;
+ if (ctx->nosharesock)
return 0;
- }
/* this server does not share socket */
if (server->nosharesock)
@@ -1438,6 +1436,9 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
goto out_err;
}
+ if (ctx->nosharesock)
+ tcp_ses->nosharesock = true;
+
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
@@ -1452,8 +1453,10 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->max_in_flight = 0;
tcp_ses->credits = 1;
if (primary_server) {
+ spin_lock(&cifs_tcp_ses_lock);
++primary_server->srv_count;
tcp_ses->primary_server = primary_server;
+ spin_unlock(&cifs_tcp_ses_lock);
}
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
@@ -1559,6 +1562,10 @@ smbd_connected:
/* fscache server cookies are based on primary channel only */
if (!CIFS_SERVER_IS_CHAN(tcp_ses))
cifs_fscache_get_client_cookie(tcp_ses);
+#ifdef CONFIG_CIFS_FSCACHE
+ else
+ tcp_ses->fscache = tcp_ses->primary_server->fscache;
+#endif /* CONFIG_CIFS_FSCACHE */
/* queue echo request delayed work */
queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
@@ -3043,12 +3050,6 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
cifs_dbg(VFS, "read only mount of RW share\n");
/* no need to log a RW mount of a typical RW share */
}
- /*
- * The cookie is initialized from volume info returned above.
- * Inside cifs_fscache_get_super_cookie it checks
- * that we do not get super cookie twice.
- */
- cifs_fscache_get_super_cookie(tcon);
}
/*
@@ -3063,6 +3064,13 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
(cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
+ /*
+ * The cookie is initialized from volume info returned above.
+ * Inside cifs_fscache_get_super_cookie it checks
+ * that we do not get super cookie twice.
+ */
+ cifs_fscache_get_super_cookie(tcon);
+
out:
mnt_ctx->server = server;
mnt_ctx->ses = ses;
@@ -3423,6 +3431,7 @@ static int connect_dfs_root(struct mount_ctx *mnt_ctx, struct dfs_cache_tgt_list
*/
mount_put_conns(mnt_ctx);
mount_get_dfs_conns(mnt_ctx);
+ set_root_ses(mnt_ctx);
full_path = build_unc_path_to_root(ctx, cifs_sb, true);
if (IS_ERR(full_path))
@@ -4111,18 +4120,6 @@ cifs_prune_tlinks(struct work_struct *work)
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-static void mark_tcon_tcp_ses_for_reconnect(struct cifs_tcon *tcon)
-{
- int i;
-
- for (i = 0; i < tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
-}
-
/* Update dfs referral path of superblock */
static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb,
const char *target)
@@ -4299,7 +4296,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco
*/
if (rc && server->current_fullpath != server->origin_fullpath) {
server->current_fullpath = server->origin_fullpath;
- mark_tcon_tcp_ses_for_reconnect(tcon);
+ cifs_ses_mark_for_reconnect(tcon->ses);
}
dfs_cache_free_tgts(tl);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 5c1259d2eeac..e9b0fa2a9614 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1355,12 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach
}
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- for (i = 0; i < tcon->ses->chan_count; i++) {
- spin_lock(&GlobalMid_Lock);
- if (tcon->ses->chans[i].server->tcpStatus != CifsExiting)
- tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&GlobalMid_Lock);
- }
+ cifs_ses_mark_for_reconnect(tcon->ses);
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 6a179ae753c1..e3ed25dc6f3f 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -435,6 +435,42 @@ out:
}
/*
+ * Remove duplicate path delimiters. Windows is supposed to do that
+ * but there are some bugs that prevent rename from working if there are
+ * multiple delimiters.
+ *
+ * Returns a sanitized duplicate of @path. The caller is responsible for
+ * cleaning up the original.
+ */
+#define IS_DELIM(c) ((c) == '/' || (c) == '\\')
+static char *sanitize_path(char *path)
+{
+ char *cursor1 = path, *cursor2 = path;
+
+ /* skip all prepended delimiters */
+ while (IS_DELIM(*cursor1))
+ cursor1++;
+
+ /* copy the first letter */
+ *cursor2 = *cursor1;
+
+ /* copy the remainder... */
+ while (*(cursor1++)) {
+ /* ... skipping all duplicated delimiters */
+ if (IS_DELIM(*cursor1) && IS_DELIM(*cursor2))
+ continue;
+ *(++cursor2) = *cursor1;
+ }
+
+ /* if the last character is a delimiter, skip it */
+ if (IS_DELIM(*(cursor2 - 1)))
+ cursor2--;
+
+ *(cursor2) = '\0';
+ return kstrdup(path, GFP_KERNEL);
+}
+
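
The two-cursor loop above collapses runs of '/' and '\' and trims leading and trailing delimiters before duplicating the result. The standalone sketch below mirrors that logic, using plain strdup() in place of kstrdup(); the test path in main() is hypothetical.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define IS_DELIM(c) ((c) == '/' || (c) == '\\')

static char *sanitize_path(char *path)
{
	char *cursor1 = path, *cursor2 = path;

	while (IS_DELIM(*cursor1))	/* skip prepended delimiters */
		cursor1++;

	*cursor2 = *cursor1;		/* copy the first character */

	while (*(cursor1++)) {
		/* drop a delimiter that follows another delimiter */
		if (IS_DELIM(*cursor1) && IS_DELIM(*cursor2))
			continue;
		*(++cursor2) = *cursor1;
	}

	if (IS_DELIM(*(cursor2 - 1)))	/* drop a trailing delimiter */
		cursor2--;
	*cursor2 = '\0';
	return strdup(path);
}

int main(void)
{
	char in[] = "//share\\\\dir///file";
	char *out = sanitize_path(in);

	printf("%s\n", out);		/* prints "share\dir/file" */
	free(out);
	return 0;
}
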
+/*
* Parse a devname into substrings and populate the ctx->UNC and ctx->prepath
* fields with the result. Returns 0 on success and an error otherwise
* (e.g. ENOMEM or EINVAL)
@@ -493,7 +529,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
if (!*pos)
return 0;
- ctx->prepath = kstrdup(pos, GFP_KERNEL);
+ ctx->prepath = sanitize_path(pos);
if (!ctx->prepath)
return -ENOMEM;
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 7e409a38a2d7..003c5f1f4dfb 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -16,14 +16,7 @@
* Key layout of CIFS server cache index object
*/
struct cifs_server_key {
- struct {
- uint16_t family; /* address family */
- __be16 port; /* IP port */
- } hdr;
- union {
- struct in_addr ipv4_addr;
- struct in6_addr ipv6_addr;
- };
+ __u64 conn_id;
} __packed;
/*
@@ -31,42 +24,23 @@ struct cifs_server_key {
*/
void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
{
- const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr;
- const struct sockaddr_in *addr = (struct sockaddr_in *) sa;
- const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa;
struct cifs_server_key key;
- uint16_t key_len = sizeof(key.hdr);
-
- memset(&key, 0, sizeof(key));
/*
- * Should not be a problem as sin_family/sin6_family overlays
- * sa_family field
+	 * Check if the cookie was already initialized so we don't reinitialize it.
+ * In the future, as we integrate with newer fscache features,
+ * we may want to instead add a check if cookie has changed
*/
- key.hdr.family = sa->sa_family;
- switch (sa->sa_family) {
- case AF_INET:
- key.hdr.port = addr->sin_port;
- key.ipv4_addr = addr->sin_addr;
- key_len += sizeof(key.ipv4_addr);
- break;
-
- case AF_INET6:
- key.hdr.port = addr6->sin6_port;
- key.ipv6_addr = addr6->sin6_addr;
- key_len += sizeof(key.ipv6_addr);
- break;
-
- default:
- cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
- server->fscache = NULL;
+ if (server->fscache)
return;
- }
+
+ memset(&key, 0, sizeof(key));
+ key.conn_id = server->conn_id;
server->fscache =
fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
&cifs_fscache_server_index_def,
- &key, key_len,
+ &key, sizeof(key),
NULL, 0,
server, 0, true);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
@@ -92,7 +66,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
* In the future, as we integrate with newer fscache features,
* we may want to instead add a check if cookie has changed
*/
- if (tcon->fscache == NULL)
+ if (tcon->fscache)
return;
sharename = extract_sharename(tcon->treeName);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82848412ad85..279622e4eb1c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1356,11 +1356,6 @@ iget_no_retry:
goto out;
}
-#ifdef CONFIG_CIFS_FSCACHE
- /* populate tcon->resource_id */
- tcon->resource_id = CIFS_I(inode)->uniqueid;
-#endif
-
if (rc && tcon->pipe) {
cifs_dbg(FYI, "ipc connection - fake read inode\n");
spin_lock(&inode->i_lock);
@@ -1375,7 +1370,6 @@ iget_no_retry:
iget_failed(inode);
inode = ERR_PTR(rc);
}
-
out:
kfree(path);
free_xid(xid);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 2c10b186ed6e..035dc3e245dc 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -95,9 +95,9 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname);
ses->chan_max = 1;
spin_unlock(&ses->chan_lock);
+ cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname);
return 0;
}
spin_unlock(&ses->chan_lock);
@@ -222,6 +222,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
/* Auth */
ctx.domainauto = ses->domainAuto;
ctx.domainname = ses->domainName;
+ ctx.server_hostname = ses->server->hostname;
ctx.username = ses->user_name;
ctx.password = ses->password;
ctx.sectype = ses->sectype;
@@ -318,6 +319,19 @@ out:
return rc;
}
+/* Mark all session channels for reconnect */
+void cifs_ses_mark_for_reconnect(struct cifs_ses *ses)
+{
+ int i;
+
+ for (i = 0; i < ses->chan_count; i++) {
+ spin_lock(&GlobalMid_Lock);
+ if (ses->chans[i].server->tcpStatus != CifsExiting)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&GlobalMid_Lock);
+ }
+}
+
static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
{
__u32 capabilities = 0;
@@ -576,8 +590,8 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
{
unsigned int tioffset; /* challenge message target info area */
unsigned int tilen; /* challenge message target info area length */
-
CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
+ __u32 server_flags;
if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len);
@@ -595,12 +609,37 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
return -EINVAL;
}
+ server_flags = le32_to_cpu(pblob->NegotiateFlags);
+ cifs_dbg(FYI, "%s: negotiate=0x%08x challenge=0x%08x\n", __func__,
+ ses->ntlmssp->client_flags, server_flags);
+
+ if ((ses->ntlmssp->client_flags & (NTLMSSP_NEGOTIATE_SEAL | NTLMSSP_NEGOTIATE_SIGN)) &&
+ (!(server_flags & NTLMSSP_NEGOTIATE_56) && !(server_flags & NTLMSSP_NEGOTIATE_128))) {
+ cifs_dbg(VFS, "%s: requested signing/encryption but server did not return either 56-bit or 128-bit session key size\n",
+ __func__);
+ return -EINVAL;
+ }
+ if (!(server_flags & NTLMSSP_NEGOTIATE_NTLM) && !(server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) {
+ cifs_dbg(VFS, "%s: server does not seem to support either NTLMv1 or NTLMv2\n", __func__);
+ return -EINVAL;
+ }
+ if (ses->server->sign && !(server_flags & NTLMSSP_NEGOTIATE_SIGN)) {
+ cifs_dbg(VFS, "%s: forced packet signing but server does not seem to support it\n",
+ __func__);
+ return -EOPNOTSUPP;
+ }
+ if ((ses->ntlmssp->client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+ !(server_flags & NTLMSSP_NEGOTIATE_KEY_XCH))
+ pr_warn_once("%s: authentication has been weakened as server does not support key exchange\n",
+ __func__);
+
+ ses->ntlmssp->server_flags = server_flags;
+
memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
- /* BB we could decode pblob->NegotiateFlags; some may be useful */
/* In particular we can examine sign flags */
/* BB spec says that if AvId field of MsvAvTimestamp is populated then
we must set the MIC field of the AUTHENTICATE_MESSAGE */
- ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
+
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
if (tioffset > blob_len || tioffset + tilen > blob_len) {
@@ -707,13 +746,13 @@ int build_ntlmssp_negotiate_blob(unsigned char **pbuffer,
flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET |
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
- NTLMSSP_NEGOTIATE_SEAL;
- if (server->sign)
- flags |= NTLMSSP_NEGOTIATE_SIGN;
+ NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL |
+ NTLMSSP_NEGOTIATE_SIGN;
if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
tmp = *pbuffer + sizeof(NEGOTIATE_MESSAGE);
+ ses->ntlmssp->client_flags = flags;
sec_blob->NegotiateFlags = cpu_to_le32(flags);
/* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */
@@ -765,15 +804,8 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
- flags = NTLMSSP_NEGOTIATE_56 |
- NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
- NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
- NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
- NTLMSSP_NEGOTIATE_SEAL | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
- if (ses->server->sign)
- flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
- flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+ flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET |
+ NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
@@ -820,9 +852,9 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
*pbuffer, &tmp,
nls_cp);
- if (((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) ||
- (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
- && !calc_seckey(ses)) {
+ if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+ (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) &&
+ !calc_seckey(ses)) {
memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer);
sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2f5f2c4c6183..8b3670388cda 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -142,7 +142,7 @@ static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
{
- int rc;
+ int rc = 0;
struct nls_table *nls_codepage;
struct cifs_ses *ses;
int retries;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 283c7b94edda..bfac462dd3e8 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -9,6 +9,8 @@
*******************************************************************************
******************************************************************************/
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lock.h"
#include "user.h"
@@ -254,10 +256,12 @@ void dlm_callback_work(struct work_struct *work)
continue;
} else if (callbacks[i].flags & DLM_CB_BAST) {
bastfn(lkb->lkb_astparam, callbacks[i].mode);
+ trace_dlm_bast(ls, lkb, callbacks[i].mode);
} else if (callbacks[i].flags & DLM_CB_CAST) {
lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
castfn(lkb->lkb_astparam);
+ trace_dlm_ast(ls, lkb, lkb->lkb_lksb);
}
}
@@ -295,7 +299,8 @@ void dlm_callback_suspend(struct dlm_ls *ls)
void dlm_callback_resume(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
- int count = 0;
+ int count = 0, sum = 0;
+ bool empty;
clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
@@ -311,14 +316,17 @@ more:
if (count == MAX_CB_QUEUE)
break;
}
+ empty = list_empty(&ls->ls_cb_delay);
mutex_unlock(&ls->ls_cb_mutex);
- if (count)
- log_rinfo(ls, "dlm_callback_resume %d", count);
- if (count == MAX_CB_QUEUE) {
+ sum += count;
+ if (!empty) {
count = 0;
cond_resched();
goto more;
}
+
+ if (sum)
+ log_rinfo(ls, "%s %d", __func__, sum);
}
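
dlm_callback_resume() now drains the delayed-callback list in batches of MAX_CB_QUEUE, rechecks whether the list became empty after each batch, and logs only the final total. The following is a compact model of that batch-and-recheck loop over a plain linked list; it is illustrative code, not the dlm implementation.

#include <stdio.h>
#include <stdlib.h>

#define MAX_BATCH 4

struct item { struct item *next; };

static struct item *queue;

static void process(struct item *it) { free(it); }

static void drain_queue(void)
{
	int count, sum = 0;
	int empty;

more:
	count = 0;
	while (queue && count < MAX_BATCH) {	/* handle at most one batch */
		struct item *it = queue;

		queue = it->next;
		process(it);
		count++;
	}
	empty = (queue == NULL);

	sum += count;
	if (!empty)		/* yield between batches, then keep draining */
		goto more;

	if (sum)
		printf("drained %d items\n", sum);
}

int main(void)
{
	for (int i = 0; i < 10; i++) {
		struct item *it = malloc(sizeof(*it));

		it->next = queue;
		queue = it;
	}
	drain_queue();		/* prints "drained 10 items" */
	return 0;
}
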
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 47e9d57e4cae..8fb04ebbafb5 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -635,6 +635,35 @@ static int table_open2(struct inode *inode, struct file *file)
return 0;
}
+static ssize_t table_write2(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ int n, len, lkb_nodeid, lkb_status, error;
+ char name[DLM_RESNAME_MAXLEN + 1] = {};
+ struct dlm_ls *ls = seq->private;
+ unsigned int lkb_flags;
+ char buf[256] = {};
+ uint32_t lkb_id;
+
+ if (copy_from_user(buf, user_buf,
+ min_t(size_t, sizeof(buf) - 1, count)))
+ return -EFAULT;
+
+ n = sscanf(buf, "%x %" __stringify(DLM_RESNAME_MAXLEN) "s %x %d %d",
+ &lkb_id, name, &lkb_flags, &lkb_nodeid, &lkb_status);
+ if (n != 5)
+ return -EINVAL;
+
+ len = strnlen(name, DLM_RESNAME_MAXLEN);
+ error = dlm_debug_add_lkb(ls, lkb_id, name, len, lkb_flags,
+ lkb_nodeid, lkb_status);
+ if (error)
+ return error;
+
+ return count;
+}
+
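
table_write2() above copies at most sizeof(buf) - 1 bytes from user space and parses a fixed five-field command with a bounded %s conversion built via __stringify(). The userspace sketch below reproduces only the parsing step, reading from a local string instead of a copied user buffer; RESNAME_MAXLEN and the xstr/str macros are stand-ins for the kernel's DLM_RESNAME_MAXLEN and __stringify.

#include <stdio.h>

#define RESNAME_MAXLEN 64
#define str(s) #s
#define xstr(s) str(s)

int main(void)
{
	const char buf[] = "1a2b3c my_resource 4 2 1";
	char name[RESNAME_MAXLEN + 1] = {0};
	unsigned int id, flags;
	int nodeid, status, n;

	/* "%x %<maxlen>s %x %d %d": lkb id, resource name, flags, nodeid, status */
	n = sscanf(buf, "%x %" xstr(RESNAME_MAXLEN) "s %x %d %d",
		   &id, name, &flags, &nodeid, &status);
	if (n != 5) {
		fprintf(stderr, "invalid command\n");
		return 1;
	}
	printf("id=%x name=%s flags=%x nodeid=%d status=%d\n",
	       id, name, flags, nodeid, status);
	return 0;
}

Bounding the %s conversion to the destination size is what keeps the fixed-length name buffer safe regardless of what the writer supplies.
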
static int table_open3(struct inode *inode, struct file *file)
{
struct seq_file *seq;
@@ -675,6 +704,7 @@ static const struct file_operations format2_fops = {
.owner = THIS_MODULE,
.open = table_open2,
.read = seq_read,
+ .write = table_write2,
.llseek = seq_lseek,
.release = seq_release
};
@@ -724,10 +754,35 @@ static ssize_t waiters_read(struct file *file, char __user *userbuf,
return rv;
}
+static ssize_t waiters_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct dlm_ls *ls = file->private_data;
+ int mstype, to_nodeid;
+ char buf[128] = {};
+ uint32_t lkb_id;
+ int n, error;
+
+ if (copy_from_user(buf, user_buf,
+ min_t(size_t, sizeof(buf) - 1, count)))
+ return -EFAULT;
+
+ n = sscanf(buf, "%x %d %d", &lkb_id, &mstype, &to_nodeid);
+ if (n != 3)
+ return -EINVAL;
+
+ error = dlm_debug_add_lkb_to_waiters(ls, lkb_id, mstype, to_nodeid);
+ if (error)
+ return error;
+
+ return count;
+}
+
static const struct file_operations waiters_fops = {
.owner = THIS_MODULE,
.open = simple_open,
.read = waiters_read,
+ .write = waiters_write,
.llseek = default_llseek,
};
@@ -768,6 +823,42 @@ static int dlm_version_show(struct seq_file *file, void *offset)
}
DEFINE_SHOW_ATTRIBUTE(dlm_version);
+static ssize_t dlm_rawmsg_write(struct file *fp, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ void *buf;
+ int ret;
+
+ if (count > PAGE_SIZE || count < sizeof(struct dlm_header))
+ return -EINVAL;
+
+ buf = kmalloc(PAGE_SIZE, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, user_buf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = dlm_midcomms_rawmsg_send(fp->private_data, buf, count);
+ if (ret)
+ goto out;
+
+ kfree(buf);
+ return count;
+
+out:
+ kfree(buf);
+ return ret;
+}
+
+static const struct file_operations dlm_rawmsg_fops = {
+ .open = simple_open,
+ .write = dlm_rawmsg_write,
+ .llseek = no_llseek,
+};
+
void *dlm_create_debug_comms_file(int nodeid, void *data)
{
struct dentry *d_node;
@@ -782,6 +873,7 @@ void *dlm_create_debug_comms_file(int nodeid, void *data)
debugfs_create_file("send_queue_count", 0444, d_node, data,
&dlm_send_queue_cnt_fops);
debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops);
+ debugfs_create_file("rawmsg", 0200, d_node, data, &dlm_rawmsg_fops);
return d_node;
}
@@ -809,7 +901,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name);
ls->ls_debug_locks_dentry = debugfs_create_file(name,
- S_IFREG | S_IRUGO,
+ 0644,
dlm_root,
ls,
&format2_fops);
@@ -840,7 +932,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name);
ls->ls_debug_waiters_dentry = debugfs_create_file(name,
- S_IFREG | S_IRUGO,
+ 0644,
dlm_root,
ls,
&waiters_fops);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 45ebbe602bbf..b6692f81ec83 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -84,8 +84,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
for (;;) {
int left;
- error = dlm_recovery_stopped(ls);
- if (error) {
+ if (dlm_recovery_stopped(ls)) {
error = -EINTR;
goto out_free;
}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5f57538b5d45..74a9590a4dd5 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -41,12 +41,6 @@
#include <linux/dlm.h>
#include "config.h"
-/* Size of the temp buffer midcomms allocates on the stack.
- We try to make this large enough so most messages fit.
- FIXME: should sctp make this unnecessary? */
-
-#define DLM_INBUF_LEN 148
-
struct dlm_ls;
struct dlm_lkb;
struct dlm_rsb;
@@ -554,8 +548,9 @@ struct dlm_ls {
uint32_t ls_generation;
uint32_t ls_exflags;
int ls_lvblen;
- int ls_count; /* refcount of processes in
+ atomic_t ls_count; /* refcount of processes in
the dlm using this ls */
+ wait_queue_head_t ls_count_wait;
int ls_create_count; /* create/release refcount */
unsigned long ls_flags; /* LSFL_ */
unsigned long ls_scan_time;
@@ -581,6 +576,7 @@ struct dlm_ls {
struct list_head ls_new_rsb; /* new rsb structs */
spinlock_t ls_remove_spin;
+ wait_queue_head_t ls_remove_wait;
char ls_remove_name[DLM_RESNAME_MAXLEN+1];
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
int ls_remove_len;
@@ -632,6 +628,8 @@ struct dlm_ls {
struct rw_semaphore ls_in_recovery; /* block local requests */
struct rw_semaphore ls_recv_active; /* block dlm_recv */
struct list_head ls_requestqueue;/* queue remote requests */
+ atomic_t ls_requestqueue_cnt;
+ wait_queue_head_t ls_requestqueue_wait;
struct mutex ls_requestqueue_mutex;
struct dlm_rcom *ls_recover_buf;
int ls_recover_nodeid; /* for debugging */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c502c065d007..bdb51d209ba2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -53,6 +53,8 @@
R: do_xxxx()
L: receive_xxxx_reply() <- R: send_xxxx_reply()
*/
+#include <trace/events/dlm.h>
+
#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -1178,7 +1180,8 @@ static void detach_lkb(struct dlm_lkb *lkb)
}
}
-static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
+ int start, int end)
{
struct dlm_lkb *lkb;
int rv;
@@ -1199,7 +1202,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
idr_preload(GFP_NOFS);
spin_lock(&ls->ls_lkbidr_spin);
- rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT);
+ rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT);
if (rv >= 0)
lkb->lkb_id = rv;
spin_unlock(&ls->ls_lkbidr_spin);
@@ -1215,6 +1218,11 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
return 0;
}
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ return _create_lkb(ls, lkb_ret, 1, 0);
+}
+
static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb;
@@ -1618,21 +1626,24 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
}
/* If there's an rsb for the same resource being removed, ensure
- that the remove message is sent before the new lookup message.
- It should be rare to need a delay here, but if not, then it may
- be worthwhile to add a proper wait mechanism rather than a delay. */
+ * that the remove message is sent before the new lookup message.
+ */
+
+#define DLM_WAIT_PENDING_COND(ls, r) \
+ (ls->ls_remove_len && \
+ !rsb_cmp(r, ls->ls_remove_name, \
+ ls->ls_remove_len))
static void wait_pending_remove(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
restart:
spin_lock(&ls->ls_remove_spin);
- if (ls->ls_remove_len &&
- !rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) {
+ if (DLM_WAIT_PENDING_COND(ls, r)) {
log_debug(ls, "delay lookup for remove dir %d %s",
- r->res_dir_nodeid, r->res_name);
+ r->res_dir_nodeid, r->res_name);
spin_unlock(&ls->ls_remove_spin);
- msleep(1);
+ wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
goto restart;
}
spin_unlock(&ls->ls_remove_spin);
@@ -1784,6 +1795,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
+ wake_up(&ls->ls_remove_wait);
send_remove(r);
@@ -3437,6 +3449,8 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error)
goto out;
+ trace_dlm_lock_start(ls, lkb, mode, flags);
+
error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
astarg, bast, &args);
if (error)
@@ -3450,6 +3464,8 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error == -EINPROGRESS)
error = 0;
out_put:
+ trace_dlm_lock_end(ls, lkb, mode, flags, error);
+
if (convert || error)
__put_lkb(ls, lkb);
if (error == -EAGAIN || error == -EDEADLK)
@@ -3481,6 +3497,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
error = set_unlock_args(flags, astarg, &args);
if (error)
goto out_put;
@@ -3495,6 +3513,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
error = 0;
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
+
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -3973,6 +3993,14 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
int from = ms->m_header.h_nodeid;
int error = 0;
+	/* currently mixing user and kernel locks is not supported */
+ if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) {
+ log_error(lkb->lkb_resource->res_ls,
+ "got user dlm message for a kernel lock");
+ error = -EINVAL;
+ goto out;
+ }
+
switch (ms->m_type) {
case DLM_MSG_CONVERT:
case DLM_MSG_UNLOCK:
@@ -4001,6 +4029,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
error = -EINVAL;
}
+out:
if (error)
log_error(lkb->lkb_resource->res_ls,
"ignore invalid message %d from %d %x %x %x %d",
@@ -4050,6 +4079,7 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
+ wake_up(&ls->ls_remove_wait);
rv = _create_message(ls, sizeof(struct dlm_message) + len,
dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
@@ -6301,3 +6331,64 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
return error;
}
+/* debug functionality */
+int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
+ int lkb_nodeid, unsigned int lkb_flags, int lkb_status)
+{
+ struct dlm_lksb *lksb;
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ /* we currently can't set a valid user lock */
+ if (lkb_flags & DLM_IFL_USER)
+ return -EOPNOTSUPP;
+
+ lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
+ if (!lksb)
+ return -ENOMEM;
+
+ error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1);
+ if (error) {
+ kfree(lksb);
+ return error;
+ }
+
+ lkb->lkb_flags = lkb_flags;
+ lkb->lkb_nodeid = lkb_nodeid;
+ lkb->lkb_lksb = lksb;
+	/* user-specific pointer; just don't leave it NULL for kernel locks */
+ if (~lkb_flags & DLM_IFL_USER)
+ lkb->lkb_astparam = (void *)0xDEADBEEF;
+
+ error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
+ if (error) {
+ kfree(lksb);
+ __put_lkb(ls, lkb);
+ return error;
+ }
+
+ lock_rsb(r);
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, lkb_status);
+ unlock_rsb(r);
+ put_rsb(r);
+
+ return 0;
+}
+
+int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
+ int mstype, int to_nodeid)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, lkb_id, &lkb);
+ if (error)
+ return error;
+
+ error = add_to_waiters(lkb, mstype, to_nodeid);
+ dlm_put_lkb(lkb);
+ return error;
+}
+
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 456c6ec3ef6f..252a5898f908 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -58,6 +58,10 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
int nodeid, int pid);
int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid);
void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
+ int lkb_nodeid, unsigned int lkb_flags, int lkb_status);
+int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
+ int mstype, int to_nodeid);
static inline int is_master(struct dlm_rsb *r)
{
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 10eddfa6c3d7..31384e7d6f90 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -314,7 +314,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_global_id == id) {
- ls->ls_count++;
+ atomic_inc(&ls->ls_count);
goto out;
}
}
@@ -331,7 +331,7 @@ struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_local_handle == lockspace) {
- ls->ls_count++;
+ atomic_inc(&ls->ls_count);
goto out;
}
}
@@ -348,7 +348,7 @@ struct dlm_ls *dlm_find_lockspace_device(int minor)
spin_lock(&lslist_lock);
list_for_each_entry(ls, &lslist, ls_list) {
if (ls->ls_device.minor == minor) {
- ls->ls_count++;
+ atomic_inc(&ls->ls_count);
goto out;
}
}
@@ -360,24 +360,24 @@ struct dlm_ls *dlm_find_lockspace_device(int minor)
void dlm_put_lockspace(struct dlm_ls *ls)
{
- spin_lock(&lslist_lock);
- ls->ls_count--;
- spin_unlock(&lslist_lock);
+ if (atomic_dec_and_test(&ls->ls_count))
+ wake_up(&ls->ls_count_wait);
}
static void remove_lockspace(struct dlm_ls *ls)
{
- for (;;) {
- spin_lock(&lslist_lock);
- if (ls->ls_count == 0) {
- WARN_ON(ls->ls_create_count != 0);
- list_del(&ls->ls_list);
- spin_unlock(&lslist_lock);
- return;
- }
+retry:
+ wait_event(ls->ls_count_wait, atomic_read(&ls->ls_count) == 0);
+
+ spin_lock(&lslist_lock);
+ if (atomic_read(&ls->ls_count) != 0) {
spin_unlock(&lslist_lock);
- ssleep(1);
+ goto retry;
}
+
+ WARN_ON(ls->ls_create_count != 0);
+ list_del(&ls->ls_list);
+ spin_unlock(&lslist_lock);
}
static int threads_start(void)
@@ -481,7 +481,8 @@ static int new_lockspace(const char *name, const char *cluster,
memcpy(ls->ls_name, name, namelen);
ls->ls_namelen = namelen;
ls->ls_lvblen = lvblen;
- ls->ls_count = 0;
+ atomic_set(&ls->ls_count, 0);
+ init_waitqueue_head(&ls->ls_count_wait);
ls->ls_flags = 0;
ls->ls_scan_time = jiffies;
@@ -511,6 +512,7 @@ static int new_lockspace(const char *name, const char *cluster,
}
spin_lock_init(&ls->ls_remove_spin);
+ init_waitqueue_head(&ls->ls_remove_wait);
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
@@ -564,6 +566,8 @@ static int new_lockspace(const char *name, const char *cluster,
init_rwsem(&ls->ls_in_recovery);
init_rwsem(&ls->ls_recv_active);
INIT_LIST_HEAD(&ls->ls_requestqueue);
+ atomic_set(&ls->ls_requestqueue_cnt, 0);
+ init_waitqueue_head(&ls->ls_requestqueue_wait);
mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks);
@@ -868,7 +872,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
* until this returns.
*
* Force has 4 possible values:
- * 0 - don't destroy locksapce if it has any LKBs
+ * 0 - don't destroy lockspace if it has any LKBs
* 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
* 2 - destroy lockspace regardless of LKBs
* 3 - destroy lockspace as part of a forced shutdown
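The ls_count hunks above replace a spinlock-protected counter, which remove_lockspace() used to poll with ssleep(1), by an atomic_t paired with a waitqueue. A stripped-down sketch of that lifetime pattern, with a hypothetical struct obj standing in for struct dlm_ls:

struct obj {
	atomic_t count;
	wait_queue_head_t count_wait;
};

static void obj_get(struct obj *o)
{
	atomic_inc(&o->count);		/* lookup paths take a reference */
}

static void obj_put(struct obj *o)
{
	/* last reference dropped: wake whoever sleeps in obj_remove() */
	if (atomic_dec_and_test(&o->count))
		wake_up(&o->count_wait);
}

static void obj_remove(struct obj *o)
{
	/* sleep until all references are gone instead of polling */
	wait_event(o->count_wait, atomic_read(&o->count) == 0);
}

The patch still rechecks ls_count under lslist_lock before list_del(), because a lookup can take a fresh reference between the wake-up and the removal; the retry label covers that window.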
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 8f715c620e1f..e284d696c1fd 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -53,9 +53,12 @@
#include <net/sctp/sctp.h>
#include <net/ipv6.h>
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lowcomms.h"
#include "midcomms.h"
+#include "memory.h"
#include "config.h"
#define NEEDED_RMEM (4*1024*1024)
@@ -84,7 +87,6 @@ struct connection {
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
atomic_t writequeue_cnt;
- struct mutex wq_alloc;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -189,6 +191,24 @@ static const struct dlm_proto_ops *dlm_proto_ops;
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
+static void writequeue_entry_ctor(void *data)
+{
+ struct writequeue_entry *entry = data;
+
+ INIT_LIST_HEAD(&entry->msgs);
+}
+
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void)
+{
+ return kmem_cache_create("dlm_writequeue", sizeof(struct writequeue_entry),
+ 0, 0, writequeue_entry_ctor);
+}
+
+struct kmem_cache *dlm_lowcomms_msg_cache_create(void)
+{
+ return kmem_cache_create("dlm_msg", sizeof(struct dlm_msg), 0, 0, NULL);
+}
+
/* need to hold writequeue_lock */
static struct writequeue_entry *con_next_wq(struct connection *con)
{
@@ -199,7 +219,10 @@ static struct writequeue_entry *con_next_wq(struct connection *con)
e = list_first_entry(&con->writequeue, struct writequeue_entry,
list);
- if (e->len == 0)
+	/* if len is zero there is nothing to send; if there are users still
+	 * filling buffers, wait until they are done so we can send more.
+	 */
+ if (e->users || e->len == 0)
return NULL;
return e;
@@ -265,8 +288,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return NULL;
}
- mutex_init(&con->wq_alloc);
-
spin_lock(&connections_lock);
/* Because multiple workqueues/threads call this function it can
* race on multiple CPUs. Instead of locking the hot path __find_con()
@@ -486,11 +507,9 @@ static void lowcomms_data_ready(struct sock *sk)
{
struct connection *con;
- read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
queue_work(recv_workqueue, &con->rwork);
- read_unlock_bh(&sk->sk_callback_lock);
}
static void lowcomms_listen_data_ready(struct sock *sk)
@@ -505,15 +524,14 @@ static void lowcomms_write_space(struct sock *sk)
{
struct connection *con;
- read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (!con)
- goto out;
+ return;
if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
log_print("successful connected to node %d", con->nodeid);
queue_work(send_workqueue, &con->swork);
- goto out;
+ return;
}
clear_bit(SOCK_NOSPACE, &con->sock->flags);
@@ -524,8 +542,6 @@ static void lowcomms_write_space(struct sock *sk)
}
queue_work(send_workqueue, &con->swork);
-out:
- read_unlock_bh(&sk->sk_callback_lock);
}
static inline void lowcomms_connect_sock(struct connection *con)
@@ -592,42 +608,41 @@ int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
static void lowcomms_error_report(struct sock *sk)
{
struct connection *con;
- struct sockaddr_storage saddr;
void (*orig_report)(struct sock *) = NULL;
+ struct inet_sock *inet;
- read_lock_bh(&sk->sk_callback_lock);
con = sock2con(sk);
if (con == NULL)
goto out;
orig_report = listen_sock.sk_error_report;
- if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) {
- printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
- "sending to node %d, port %d, "
- "sk_err=%d/%d\n", dlm_our_nodeid(),
- con->nodeid, dlm_config.ci_tcp_port,
- sk->sk_err, sk->sk_err_soft);
- } else if (saddr.ss_family == AF_INET) {
- struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
+ inet = inet_sk(sk);
+ switch (sk->sk_family) {
+ case AF_INET:
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
- "sending to node %d at %pI4, port %d, "
+ "sending to node %d at %pI4, dport %d, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
- con->nodeid, &sin4->sin_addr.s_addr,
- dlm_config.ci_tcp_port, sk->sk_err,
+ con->nodeid, &inet->inet_daddr,
+ ntohs(inet->inet_dport), sk->sk_err,
sk->sk_err_soft);
- } else {
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr;
-
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
- "sending to node %d at %u.%u.%u.%u, "
- "port %d, sk_err=%d/%d\n", dlm_our_nodeid(),
- con->nodeid, sin6->sin6_addr.s6_addr32[0],
- sin6->sin6_addr.s6_addr32[1],
- sin6->sin6_addr.s6_addr32[2],
- sin6->sin6_addr.s6_addr32[3],
- dlm_config.ci_tcp_port, sk->sk_err,
+ "sending to node %d at %pI6c, "
+ "dport %d, sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport), sk->sk_err,
sk->sk_err_soft);
+ break;
+#endif
+ default:
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "invalid socket family %d set, "
+ "sk_err=%d/%d\n", dlm_our_nodeid(),
+ sk->sk_family, sk->sk_err, sk->sk_err_soft);
+ goto out;
}
/* below sendcon only handling */
@@ -646,7 +661,6 @@ static void lowcomms_error_report(struct sock *sk)
queue_work(send_workqueue, &con->swork);
out:
- read_unlock_bh(&sk->sk_callback_lock);
if (orig_report)
orig_report(sk);
}
@@ -666,20 +680,20 @@ static void restore_callbacks(struct socket *sock)
{
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = listen_sock.sk_data_ready;
sk->sk_state_change = listen_sock.sk_state_change;
sk->sk_write_space = listen_sock.sk_write_space;
sk->sk_error_report = listen_sock.sk_error_report;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
static void add_listen_sock(struct socket *sock, struct listen_connection *con)
{
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
save_listen_callbacks(sock);
con->sock = sock;
@@ -687,7 +701,7 @@ static void add_listen_sock(struct socket *sock, struct listen_connection *con)
sk->sk_allocation = GFP_NOFS;
/* Install a data_ready callback */
sk->sk_data_ready = lowcomms_listen_data_ready;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
/* Make a socket active */
@@ -695,7 +709,7 @@ static void add_sock(struct socket *sock, struct connection *con)
{
struct sock *sk = sock->sk;
- write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
con->sock = sock;
sk->sk_user_data = con;
@@ -705,7 +719,7 @@ static void add_sock(struct socket *sock, struct connection *con)
sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
sk->sk_error_report = lowcomms_error_report;
- write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
}
/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -733,7 +747,7 @@ static void dlm_page_release(struct kref *kref)
ref);
__free_page(e->page);
- kfree(e);
+ dlm_free_writequeue(e);
}
static void dlm_msg_release(struct kref *kref)
@@ -741,7 +755,7 @@ static void dlm_msg_release(struct kref *kref)
struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref);
kref_put(&msg->entry->ref, dlm_page_release);
- kfree(msg);
+ dlm_free_msg(msg);
}
static void free_entry(struct writequeue_entry *e)
@@ -925,6 +939,7 @@ static int receive_from_sock(struct connection *con)
msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
msg.msg_flags);
+ trace_dlm_recv(con->nodeid, ret);
if (ret == -EAGAIN)
break;
else if (ret <= 0)
@@ -1013,10 +1028,28 @@ static int accept_from_sock(struct listen_connection *con)
/* Get the new node's NODEID */
make_sockaddr(&peeraddr, 0, &len);
if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) {
- unsigned char *b=(unsigned char *)&peeraddr;
- log_print("connect from non cluster node");
- print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
- b, sizeof(struct sockaddr_storage));
+ switch (peeraddr.ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin = (struct sockaddr_in *)&peeraddr;
+
+ log_print("connect from non cluster IPv4 node %pI4",
+ &sin->sin_addr);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&peeraddr;
+
+ log_print("connect from non cluster IPv6 node %pI6c",
+ &sin6->sin6_addr);
+ break;
+ }
+#endif
+ default:
+ log_print("invalid family from non cluster node");
+ break;
+ }
+
sock_release(newsock);
return -1;
}
@@ -1177,33 +1210,33 @@ static void deinit_local(void)
kfree(dlm_local_addr[i]);
}
-static struct writequeue_entry *new_writequeue_entry(struct connection *con,
- gfp_t allocation)
+static struct writequeue_entry *new_writequeue_entry(struct connection *con)
{
struct writequeue_entry *entry;
- entry = kzalloc(sizeof(*entry), allocation);
+ entry = dlm_allocate_writequeue();
if (!entry)
return NULL;
- entry->page = alloc_page(allocation | __GFP_ZERO);
+ entry->page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
if (!entry->page) {
- kfree(entry);
+ dlm_free_writequeue(entry);
return NULL;
}
+ entry->offset = 0;
+ entry->len = 0;
+ entry->end = 0;
+ entry->dirty = false;
entry->con = con;
entry->users = 1;
kref_init(&entry->ref);
- INIT_LIST_HEAD(&entry->msgs);
-
return entry;
}
static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
- gfp_t allocation, char **ppc,
- void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh)
+ char **ppc, void (*cb)(void *data),
+ void *data)
{
struct writequeue_entry *e;
@@ -1215,74 +1248,54 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
*ppc = page_address(e->page) + e->end;
if (cb)
- cb(mh);
+ cb(data);
e->end += len;
e->users++;
- spin_unlock(&con->writequeue_lock);
-
- return e;
+ goto out;
}
}
- spin_unlock(&con->writequeue_lock);
- e = new_writequeue_entry(con, allocation);
+ e = new_writequeue_entry(con);
if (!e)
- return NULL;
+ goto out;
kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
atomic_inc(&con->writequeue_cnt);
-
- spin_lock(&con->writequeue_lock);
if (cb)
- cb(mh);
+ cb(data);
list_add_tail(&e->list, &con->writequeue);
- spin_unlock(&con->writequeue_lock);
+out:
+ spin_unlock(&con->writequeue_lock);
return e;
};
static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
gfp_t allocation, char **ppc,
- void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh)
+ void (*cb)(void *data),
+ void *data)
{
struct writequeue_entry *e;
struct dlm_msg *msg;
- bool sleepable;
- msg = kzalloc(sizeof(*msg), allocation);
+ msg = dlm_allocate_msg(allocation);
if (!msg)
return NULL;
- /* this mutex is being used as a wait to avoid multiple "fast"
- * new writequeue page list entry allocs in new_wq_entry in
- * normal operation which is sleepable context. Without it
- * we could end in multiple writequeue entries with one
- * dlm message because multiple callers were waiting at
- * the writequeue_lock in new_wq_entry().
- */
- sleepable = gfpflags_normal_context(allocation);
- if (sleepable)
- mutex_lock(&con->wq_alloc);
-
kref_init(&msg->ref);
- e = new_wq_entry(con, len, allocation, ppc, cb, mh);
+ e = new_wq_entry(con, len, ppc, cb, data);
if (!e) {
- if (sleepable)
- mutex_unlock(&con->wq_alloc);
-
- kfree(msg);
+ dlm_free_msg(msg);
return NULL;
}
- if (sleepable)
- mutex_unlock(&con->wq_alloc);
-
+ msg->retransmit = false;
+ msg->orig_msg = NULL;
msg->ppc = *ppc;
msg->len = len;
msg->entry = e;
@@ -1291,8 +1304,8 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
}
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
- char **ppc, void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh)
+ char **ppc, void (*cb)(void *data),
+ void *data)
{
struct connection *con;
struct dlm_msg *msg;
@@ -1313,7 +1326,7 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
return NULL;
}
- msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh);
+ msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, data);
if (!msg) {
srcu_read_unlock(&connections_srcu, idx);
return NULL;
@@ -1403,7 +1416,6 @@ static void send_to_sock(struct connection *con)
if (!e)
break;
- e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
len = e->len;
offset = e->offset;
BUG_ON(len == 0 && e->users == 0);
@@ -1411,6 +1423,7 @@ static void send_to_sock(struct connection *con)
ret = kernel_sendpage(con->sock, e->page, offset, len,
msg_flags);
+ trace_dlm_send(con->nodeid, ret);
if (ret == -EAGAIN || ret == 0) {
if (ret == -EAGAIN &&
test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
@@ -1680,9 +1693,9 @@ static void _stop_conn(struct connection *con, bool and_other)
set_bit(CF_READ_PENDING, &con->flags);
set_bit(CF_WRITE_PENDING, &con->flags);
if (con->sock && con->sock->sk) {
- write_lock_bh(&con->sock->sk->sk_callback_lock);
+ lock_sock(con->sock->sk);
con->sock->sk->sk_user_data = NULL;
- write_unlock_bh(&con->sock->sk->sk_callback_lock);
+ release_sock(con->sock->sk);
}
if (con->othercon && and_other)
_stop_conn(con->othercon, false);
@@ -1775,7 +1788,7 @@ static int dlm_listen_for_all(void)
result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
- log_print("Can't create comms socket, check SCTP is loaded");
+ log_print("Can't create comms socket: %d", result);
goto out;
}
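With sk_callback_lock dropped from the callbacks, the setter paths above now take lock_sock()/release_sock() while swapping sk_user_data and the callback pointers. A reduced sketch of that pattern, with hypothetical names mirroring add_sock() and lowcomms_data_ready():

static void example_data_ready(struct sock *sk)
{
	struct connection *con = sk->sk_user_data;

	if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
		queue_work(recv_workqueue, &con->rwork);
}

static void example_add_sock(struct socket *sock, struct connection *con)
{
	struct sock *sk = sock->sk;

	lock_sock(sk);
	/* publish sk_user_data and the callbacks together */
	con->sock = sock;
	sk->sk_user_data = con;
	sk->sk_data_ready = example_data_ready;
	sk->sk_allocation = GFP_NOFS;
	release_sock(sk);
}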
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 4ccae07cf005..29369feea991 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -38,8 +38,8 @@ void dlm_lowcomms_stop(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
- char **ppc, void (*cb)(struct dlm_mhandle *mh),
- struct dlm_mhandle *mh);
+ char **ppc, void (*cb)(void *data),
+ void *data);
void dlm_lowcomms_commit_msg(struct dlm_msg *msg);
void dlm_lowcomms_put_msg(struct dlm_msg *msg);
int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
@@ -47,6 +47,8 @@ int dlm_lowcomms_connect_node(int nodeid);
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
void dlm_midcomms_receive_done(int nodeid);
+struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void);
+struct kmem_cache *dlm_lowcomms_msg_cache_create(void);
#endif /* __LOWCOMMS_DOT_H__ */
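dlm_lowcomms_writequeue_cache_create() declared above pairs a dedicated slab with a constructor, so the msgs list head is initialised once when the slab object is created rather than on every allocation. A generic sketch of that pattern with hypothetical names (the real caches are set up from dlm_memory_init() later in this series of hunks):

struct wq_entry {
	struct list_head msgs;
	/* ... payload fields ... */
};

static struct kmem_cache *wq_cache;

static void wq_entry_ctor(void *data)
{
	struct wq_entry *e = data;

	/* runs once per object when the slab page is populated */
	INIT_LIST_HEAD(&e->msgs);
}

static int __init wq_cache_init(void)
{
	wq_cache = kmem_cache_create("example_wq", sizeof(struct wq_entry),
				     0, 0, wq_entry_ctor);
	return wq_cache ? 0 : -ENOMEM;
}

The trade-off is that objects must be returned to the cache in the constructed state (here: with msgs empty again), since the constructor does not run again on reallocation.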
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index afc66a1346d3..1c5be4b70ac1 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -19,6 +19,9 @@
#include "config.h"
#include "lowcomms.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/dlm.h>
+
static int __init init_dlm(void)
{
int error;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 731d489aa323..61f906e705db 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -442,8 +442,7 @@ static int ping_members(struct dlm_ls *ls)
int error = 0;
list_for_each_entry(memb, &ls->ls_nodes, list) {
- error = dlm_recovery_stopped(ls);
- if (error) {
+ if (dlm_recovery_stopped(ls)) {
error = -EINTR;
break;
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 5918f4d39586..ce35c3c19aeb 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -10,32 +10,61 @@
******************************************************************************/
#include "dlm_internal.h"
+#include "midcomms.h"
+#include "lowcomms.h"
#include "config.h"
#include "memory.h"
+static struct kmem_cache *writequeue_cache;
+static struct kmem_cache *mhandle_cache;
+static struct kmem_cache *msg_cache;
static struct kmem_cache *lkb_cache;
static struct kmem_cache *rsb_cache;
int __init dlm_memory_init(void)
{
+ writequeue_cache = dlm_lowcomms_writequeue_cache_create();
+ if (!writequeue_cache)
+ goto out;
+
+ mhandle_cache = dlm_midcomms_cache_create();
+ if (!mhandle_cache)
+ goto mhandle;
+
lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
__alignof__(struct dlm_lkb), 0, NULL);
if (!lkb_cache)
- return -ENOMEM;
+ goto lkb;
+
+ msg_cache = dlm_lowcomms_msg_cache_create();
+ if (!msg_cache)
+ goto msg;
rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb),
__alignof__(struct dlm_rsb), 0, NULL);
- if (!rsb_cache) {
- kmem_cache_destroy(lkb_cache);
- return -ENOMEM;
- }
+ if (!rsb_cache)
+ goto rsb;
return 0;
+
+rsb:
+ kmem_cache_destroy(msg_cache);
+msg:
+ kmem_cache_destroy(lkb_cache);
+lkb:
+ kmem_cache_destroy(mhandle_cache);
+mhandle:
+ kmem_cache_destroy(writequeue_cache);
+out:
+ return -ENOMEM;
}
void dlm_memory_exit(void)
{
+ kmem_cache_destroy(writequeue_cache);
+ kmem_cache_destroy(mhandle_cache);
+ kmem_cache_destroy(msg_cache);
kmem_cache_destroy(lkb_cache);
kmem_cache_destroy(rsb_cache);
}
@@ -89,3 +118,32 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
kmem_cache_free(lkb_cache, lkb);
}
+struct dlm_mhandle *dlm_allocate_mhandle(void)
+{
+ return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+}
+
+void dlm_free_mhandle(struct dlm_mhandle *mhandle)
+{
+ kmem_cache_free(mhandle_cache, mhandle);
+}
+
+struct writequeue_entry *dlm_allocate_writequeue(void)
+{
+ return kmem_cache_alloc(writequeue_cache, GFP_ATOMIC);
+}
+
+void dlm_free_writequeue(struct writequeue_entry *writequeue)
+{
+ kmem_cache_free(writequeue_cache, writequeue);
+}
+
+struct dlm_msg *dlm_allocate_msg(gfp_t allocation)
+{
+ return kmem_cache_alloc(msg_cache, allocation);
+}
+
+void dlm_free_msg(struct dlm_msg *msg)
+{
+ kmem_cache_free(msg_cache, msg);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 4f218ea4b187..7bd3f1a391ca 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,6 +20,12 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
void dlm_free_lkb(struct dlm_lkb *l);
char *dlm_allocate_lvb(struct dlm_ls *ls);
void dlm_free_lvb(char *l);
+struct dlm_mhandle *dlm_allocate_mhandle(void);
+void dlm_free_mhandle(struct dlm_mhandle *mhandle);
+struct writequeue_entry *dlm_allocate_writequeue(void);
+void dlm_free_writequeue(struct writequeue_entry *writequeue);
+struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
+void dlm_free_msg(struct dlm_msg *msg);
#endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 7ae39ec8d9b0..3635e42b0669 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -137,6 +137,7 @@
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
+#include "memory.h"
#include "lock.h"
#include "util.h"
#include "midcomms.h"
@@ -220,6 +221,12 @@ DEFINE_STATIC_SRCU(nodes_srcu);
*/
static DEFINE_MUTEX(close_lock);
+struct kmem_cache *dlm_midcomms_cache_create(void)
+{
+ return kmem_cache_create("dlm_mhandle", sizeof(struct dlm_mhandle),
+ 0, 0, NULL);
+}
+
static inline const char *dlm_state_str(int state)
{
switch (state) {
@@ -279,7 +286,7 @@ static void dlm_mhandle_release(struct rcu_head *rcu)
struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu);
dlm_lowcomms_put_msg(mh->msg);
- kfree(mh);
+ dlm_free_mhandle(mh);
}
static void dlm_mhandle_delete(struct midcomms_node *node,
@@ -909,11 +916,11 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
- switch (le32_to_cpu(hd->h_version)) {
- case DLM_VERSION_3_1:
+ switch (hd->h_version) {
+ case cpu_to_le32(DLM_VERSION_3_1):
dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
break;
- case DLM_VERSION_3_2:
+ case cpu_to_le32(DLM_VERSION_3_2):
dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid);
break;
default:
@@ -969,7 +976,7 @@ void dlm_midcomms_receive_done(int nodeid)
spin_unlock(&node->state_lock);
/* do nothing, FIN has its own ack send */
break;
- };
+ }
srcu_read_unlock(&nodes_srcu, idx);
}
@@ -1020,8 +1027,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
header_out(&opts->o_header);
}
-static void midcomms_new_msg_cb(struct dlm_mhandle *mh)
+static void midcomms_new_msg_cb(void *data)
{
+ struct dlm_mhandle *mh = data;
+
atomic_inc(&mh->node->send_queue_cnt);
spin_lock(&mh->node->send_queue_lock);
@@ -1071,10 +1080,12 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
/* this is a bug, however we keep going and hope it will be resolved */
WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
- mh = kzalloc(sizeof(*mh), GFP_NOFS);
+ mh = dlm_allocate_mhandle();
if (!mh)
goto err;
+ mh->committed = false;
+ mh->ack_rcv = NULL;
mh->idx = idx;
mh->node = node;
@@ -1083,7 +1094,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc,
NULL, NULL);
if (!msg) {
- kfree(mh);
+ dlm_free_mhandle(mh);
goto err;
}
@@ -1092,13 +1103,13 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
ppc);
if (!msg) {
- kfree(mh);
+ dlm_free_mhandle(mh);
goto err;
}
break;
default:
- kfree(mh);
+ dlm_free_mhandle(mh);
WARN_ON(1);
goto err;
}
@@ -1134,7 +1145,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
dlm_lowcomms_commit_msg(mh->msg);
dlm_lowcomms_put_msg(mh->msg);
/* mh is not part of rcu list in this case */
- kfree(mh);
+ dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
dlm_midcomms_commit_msg_3_2(mh);
@@ -1231,7 +1242,7 @@ void dlm_midcomms_add_member(int nodeid)
}
node->users++;
- pr_debug("users inc count %d\n", node->users);
+ pr_debug("node %d users inc count %d\n", nodeid, node->users);
spin_unlock(&node->state_lock);
srcu_read_unlock(&nodes_srcu, idx);
@@ -1254,7 +1265,7 @@ void dlm_midcomms_remove_member(int nodeid)
spin_lock(&node->state_lock);
node->users--;
- pr_debug("users dec count %d\n", node->users);
+ pr_debug("node %d users dec count %d\n", nodeid, node->users);
/* hitting users count to zero means the
* other side is running dlm_midcomms_stop()
@@ -1425,3 +1436,51 @@ int dlm_midcomms_close(int nodeid)
return ret;
}
+
+/* debug functionality to send raw dlm msg from user space */
+struct dlm_rawmsg_data {
+ struct midcomms_node *node;
+ void *buf;
+};
+
+static void midcomms_new_rawmsg_cb(void *data)
+{
+ struct dlm_rawmsg_data *rd = data;
+ struct dlm_header *h = rd->buf;
+
+ switch (h->h_version) {
+ case cpu_to_le32(DLM_VERSION_3_1):
+ break;
+ default:
+ switch (h->h_cmd) {
+ case DLM_OPTS:
+ if (!h->u.h_seq)
+ h->u.h_seq = rd->node->seq_send++;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+}
+
+int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf,
+ int buflen)
+{
+ struct dlm_rawmsg_data rd;
+ struct dlm_msg *msg;
+ char *msgbuf;
+
+ rd.node = node;
+ rd.buf = buf;
+
+ msg = dlm_lowcomms_new_msg(node->nodeid, buflen, GFP_NOFS,
+ &msgbuf, midcomms_new_rawmsg_cb, &rd);
+ if (!msg)
+ return -ENOMEM;
+
+ memcpy(msgbuf, buf, buflen);
+ dlm_lowcomms_commit_msg(msg);
+ return 0;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 579abc6929be..82bcd9661922 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -28,6 +28,9 @@ const char *dlm_midcomms_state(struct midcomms_node *node);
unsigned long dlm_midcomms_flags(struct midcomms_node *node);
int dlm_midcomms_send_queue_cnt(struct midcomms_node *node);
uint32_t dlm_midcomms_version(struct midcomms_node *node);
+int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf,
+ int buflen);
+struct kmem_cache *dlm_midcomms_cache_create(void);
#endif /* __MIDCOMMS_DOT_H__ */
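The receive switch in midcomms.c above now compares the on-wire __le32 header field against cpu_to_le32() constants instead of byte-swapping every header. A tiny sketch of the idiom with a hypothetical header struct; cpu_to_le32() on a constant folds at compile time, so the per-message swap disappears on big-endian hosts and stays a no-op on little-endian ones:

struct example_hdr {
	__le32 version;
};

static void example_dispatch(const struct example_hdr *h)
{
	switch (h->version) {
	case cpu_to_le32(1):
		/* handle version 1 */
		break;
	case cpu_to_le32(2):
		/* handle version 2 */
		break;
	default:
		/* unknown version, drop */
		break;
	}
}

This also keeps the endianness annotations consistent for sparse, since both sides of the comparison stay __le32.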
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 6cba86470278..5821b777a1a7 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -601,7 +601,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
spin_lock(&ls->ls_recover_lock);
status = ls->ls_recover_status;
- stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
+ stop = dlm_recovery_stopped(ls);
seq = ls->ls_recover_seq;
spin_unlock(&ls->ls_recover_lock);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 97d052cea5a9..a55dfce705dd 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -124,8 +124,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_recover_waiters_pre(ls);
- error = dlm_recovery_stopped(ls);
- if (error) {
+ if (dlm_recovery_stopped(ls)) {
error = -EINTR;
goto fail;
}
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index e89e0ff8bfa3..ccb5307c21e9 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -44,6 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->nodeid = nodeid;
memcpy(&e->request, ms, ms->m_header.h_length);
+ atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
list_add_tail(&e->list, &ls->ls_requestqueue);
mutex_unlock(&ls->ls_requestqueue_mutex);
@@ -89,6 +90,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
mutex_lock(&ls->ls_requestqueue_mutex);
list_del(&e->list);
+ if (atomic_dec_and_test(&ls->ls_requestqueue_cnt))
+ wake_up(&ls->ls_requestqueue_wait);
kfree(e);
if (dlm_locking_stopped(ls)) {
@@ -115,14 +118,8 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
void dlm_wait_requestqueue(struct dlm_ls *ls)
{
- for (;;) {
- mutex_lock(&ls->ls_requestqueue_mutex);
- if (list_empty(&ls->ls_requestqueue))
- break;
- mutex_unlock(&ls->ls_requestqueue_mutex);
- schedule();
- }
- mutex_unlock(&ls->ls_requestqueue_mutex);
+ wait_event(ls->ls_requestqueue_wait,
+ atomic_read(&ls->ls_requestqueue_cnt) == 0);
}
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
@@ -130,7 +127,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
uint32_t type = ms->m_type;
/* the ls is being cleaned up and freed by release_lockspace */
- if (!ls->ls_count)
+ if (!atomic_read(&ls->ls_count))
return 1;
if (dlm_is_removed(ls, nodeid))
@@ -161,6 +158,8 @@ void dlm_purge_requestqueue(struct dlm_ls *ls)
if (purge_request(ls, ms, e->nodeid)) {
list_del(&e->list);
+ if (atomic_dec_and_test(&ls->ls_requestqueue_cnt))
+ wake_up(&ls->ls_requestqueue_wait);
kfree(e);
}
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d66bbd2df191..2dd23a82e0de 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -537,7 +537,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out_free;
}
- if (mnt_user_ns(path.mnt) != &init_user_ns) {
+ if (is_idmapped_mnt(path.mnt)) {
rc = -EINVAL;
printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n");
goto out_free;
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 756fe2d65272..8a3317e38e5a 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 579406504919..19e6c56a9f47 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -12,7 +12,7 @@ struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
- unsigned short pageofs_out;
+ unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
/* indicate the algorithm will be used for decompression */
@@ -87,6 +87,8 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
return page->mapping == MNGD_MAPPING(sbi);
}
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize);
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0e35ef3f9f3d..e18476c85fa2 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -9,37 +9,71 @@
#include <linux/dax.h>
#include <trace/events/erofs.h>
-struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
+void erofs_unmap_metabuf(struct erofs_buf *buf)
+{
+ if (buf->kmap_type == EROFS_KMAP)
+ kunmap(buf->page);
+ else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
+ kunmap_atomic(buf->base);
+ buf->base = NULL;
+ buf->kmap_type = EROFS_NO_KMAP;
+}
+
+void erofs_put_metabuf(struct erofs_buf *buf)
+{
+ if (!buf->page)
+ return;
+ erofs_unmap_metabuf(buf);
+ put_page(buf->page);
+ buf->page = NULL;
+}
+
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
- struct page *page;
-
- page = read_cache_page_gfp(mapping, blkaddr,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- /* should already be PageUptodate */
- if (!IS_ERR(page))
- lock_page(page);
- return page;
+ erofs_off_t offset = blknr_to_addr(blkaddr);
+ pgoff_t index = offset >> PAGE_SHIFT;
+ struct page *page = buf->page;
+
+ if (!page || page->index != index) {
+ erofs_put_metabuf(buf);
+ page = read_cache_page_gfp(mapping, index,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (IS_ERR(page))
+ return page;
+ /* should already be PageUptodate, no need to lock page */
+ buf->page = page;
+ }
+ if (buf->kmap_type == EROFS_NO_KMAP) {
+ if (type == EROFS_KMAP)
+ buf->base = kmap(page);
+ else if (type == EROFS_KMAP_ATOMIC)
+ buf->base = kmap_atomic(page);
+ buf->kmap_type = type;
+ } else if (buf->kmap_type != type) {
+ DBG_BUGON(1);
+ return ERR_PTR(-EFAULT);
+ }
+ if (type == EROFS_NO_KMAP)
+ return NULL;
+ return buf->base + (offset & ~PAGE_MASK);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
{
- int err = 0;
erofs_blk_t nblocks, lastblk;
u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
- trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
-
- nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+ nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
lastblk = nblocks - tailendpacking;
/* there is no hole in flatmode */
map->m_flags = EROFS_MAP_MAPPED;
-
if (offset < blknr_to_addr(lastblk)) {
map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
map->m_plen = blknr_to_addr(lastblk) - offset;
@@ -51,30 +85,23 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
vi->xattr_isize + erofs_blkoff(map->m_la);
map->m_plen = inode->i_size - offset;
- /* inline data should be located in one meta block */
- if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
+ /* inline data should be located in the same meta block */
+ if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) {
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
vi->nid);
DBG_BUGON(1);
- err = -EFSCORRUPTED;
- goto err_out;
+ return -EFSCORRUPTED;
}
-
map->m_flags |= EROFS_MAP_META;
} else {
erofs_err(inode->i_sb,
"internal error @ nid: %llu (size %llu), m_la 0x%llx",
vi->nid, inode->i_size, map->m_la);
DBG_BUGON(1);
- err = -EIO;
- goto err_out;
+ return -EIO;
}
-
- map->m_llen = map->m_plen;
-err_out:
- trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
- return err;
+ return 0;
}
static int erofs_map_blocks(struct inode *inode,
@@ -83,12 +110,14 @@ static int erofs_map_blocks(struct inode *inode,
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
u64 chunknr;
unsigned int unit;
erofs_off_t pos;
+ void *kaddr;
int err = 0;
+ trace_erofs_map_blocks_enter(inode, map, flags);
map->m_deviceid = 0;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
@@ -97,8 +126,10 @@ static int erofs_map_blocks(struct inode *inode,
goto out;
}
- if (vi->datalayout != EROFS_INODE_CHUNK_BASED)
- return erofs_map_blocks_flatmode(inode, map, flags);
+ if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
+ err = erofs_map_blocks_flatmode(inode, map, flags);
+ goto out;
+ }
if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
unit = sizeof(*idx); /* chunk index */
@@ -109,17 +140,18 @@ static int erofs_map_blocks(struct inode *inode,
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos));
- if (IS_ERR(page))
- return PTR_ERR(page);
-
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
+ goto out;
+ }
map->m_la = chunknr << vi->chunkbits;
map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = page_address(page) + erofs_blkoff(pos);
+ __le32 *blkaddr = kaddr + erofs_blkoff(pos);
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
@@ -130,7 +162,7 @@ static int erofs_map_blocks(struct inode *inode,
goto out_unlock;
}
/* parse chunk indexes */
- idx = page_address(page) + erofs_blkoff(pos);
+ idx = kaddr + erofs_blkoff(pos);
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
@@ -143,10 +175,11 @@ static int erofs_map_blocks(struct inode *inode,
break;
}
out_unlock:
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
out:
- map->m_llen = map->m_plen;
+ if (!err)
+ map->m_llen = map->m_plen;
+ trace_erofs_map_blocks_exit(inode, map, flags, 0);
return err;
}
@@ -231,16 +264,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
}
if (map.m_flags & EROFS_MAP_META) {
- struct page *ipage;
+ void *ptr;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ipage = erofs_get_meta_page(inode->i_sb,
- erofs_blknr(mdev.m_pa));
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- iomap->inline_data = page_address(ipage) +
- erofs_blkoff(mdev.m_pa);
- iomap->private = ipage;
+ ptr = erofs_read_metabuf(&buf, inode->i_sb,
+ erofs_blknr(mdev.m_pa), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa);
+ iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = mdev.m_pa;
@@ -251,12 +284,17 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
- struct page *ipage = iomap->private;
+ void *ptr = iomap->private;
+
+ if (ptr) {
+ struct erofs_buf buf = {
+ .page = kmap_to_page(ptr),
+ .base = ptr,
+ .kmap_type = EROFS_KMAP,
+ };
- if (ipage) {
DBG_BUGON(iomap->type != IOMAP_INLINE);
- unlock_page(ipage);
- put_page(ipage);
+ erofs_put_metabuf(&buf);
} else {
DBG_BUGON(iomap->type == IOMAP_INLINE);
}
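The data.c conversion above replaces erofs_get_meta_page(), which locked and released a page per access, with the new erofs_buf helpers. A minimal sketch of the intended calling pattern, using only the functions introduced in this diff:

static int example_read_meta_block(struct super_block *sb, erofs_blk_t blkaddr)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *kaddr;

	kaddr = erofs_read_metabuf(&buf, sb, blkaddr, EROFS_KMAP);
	if (IS_ERR(kaddr))
		return PTR_ERR(kaddr);

	/* ... read up to EROFS_BLKSIZ bytes starting at kaddr ... */

	erofs_put_metabuf(&buf);	/* unmap and drop the page reference */
	return 0;
}

Because erofs_read_metabuf() keeps the cached page when the requested block lands on the page already held, callers such as erofs_map_blocks() can hold one erofs_buf across a loop and only pay for a fresh page read when the block actually changes.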
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index bf37fc76b182..3efa686c7644 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -16,6 +16,14 @@
#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
#endif
+struct z_erofs_lz4_decompress_ctx {
+ struct z_erofs_decompress_req *rq;
+ /* # of encoded, decoded pages */
+ unsigned int inpages, outpages;
+ /* decoded block total length (used for in-place decompression) */
+ unsigned int oend;
+};
+
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int size)
@@ -56,11 +64,10 @@ int z_erofs_load_lz4_config(struct super_block *sb,
* Fill all gaps with bounce pages if it's a sparse page list. Also check if
* all physical pages are consecutive, which can be seen for moderate CR.
*/
-static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
struct page **pagepool)
{
- const unsigned int nr =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
@@ -70,7 +77,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < nr; ++i, ++j) {
+ for (i = j = 0; i < ctx->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
@@ -112,41 +119,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
void *inpage, unsigned int *inputmargin, int *maptype,
- bool support_0padding)
+ bool may_inplace)
{
- unsigned int nrpages_in, nrpages_out;
- unsigned int ofull, oend, inputsize, total, i, j;
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ unsigned int omargin, total, i, j;
struct page **in;
void *src, *tmp;
- inputsize = rq->inputsize;
- nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
- oend = rq->pageofs_out + rq->outputsize;
- ofull = PAGE_ALIGN(oend);
- nrpages_out = ofull >> PAGE_SHIFT;
-
if (rq->inplace_io) {
- if (rq->partial_decoding || !support_0padding ||
- ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+ omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ if (rq->partial_decoding || !may_inplace ||
+ omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
goto docopy;
- for (i = 0; i < nrpages_in; ++i) {
+ for (i = 0; i < ctx->inpages; ++i) {
DBG_BUGON(rq->in[i] == NULL);
- for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+ for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j)
if (rq->out[j] == rq->in[i])
goto docopy;
}
}
- if (nrpages_in <= 1) {
+ if (ctx->inpages <= 1) {
*maptype = 0;
return inpage;
}
kunmap_atomic(inpage);
might_sleep();
- src = erofs_vm_map_ram(rq->in, nrpages_in);
+ src = erofs_vm_map_ram(rq->in, ctx->inpages);
if (!src)
return ERR_PTR(-ENOMEM);
*maptype = 1;
@@ -155,7 +157,7 @@ static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = erofs_get_pcpubuf(nrpages_in);
+ src = erofs_get_pcpubuf(ctx->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_atomic(inpage);
@@ -182,36 +184,53 @@ docopy:
return src;
}
-static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
+/*
+ * Get the exact inputsize with zero_padding feature.
+ * - For LZ4, it should work if zero_padding feature is on (5.3+);
+ * - For MicroLZMA, it'd be enabled all the time.
+ */
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize)
+{
+ const char *padend;
+
+ padend = memchr_inv(padbuf, 0, padbufsize);
+ if (!padend)
+ return -EFSCORRUPTED;
+ rq->inputsize -= padend - padbuf;
+ rq->pageofs_in += padend - padbuf;
+ return 0;
+}
+
+static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
u8 *out)
{
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
u8 *headpage, *src;
- bool support_0padding;
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
headpage = kmap_atomic(*rq->in);
- inputmargin = 0;
- support_0padding = false;
- /* decompression inplace is only safe when 0padding is enabled */
- if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
+ /* LZ4 decompression inplace is only safe if zero_padding is enabled */
+ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
support_0padding = true;
-
- while (!headpage[inputmargin & ~PAGE_MASK])
- if (!(++inputmargin & ~PAGE_MASK))
- break;
-
- if (inputmargin >= rq->inputsize) {
+ ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ EROFS_BLKSIZ - rq->pageofs_in));
+ if (ret) {
kunmap_atomic(headpage);
- return -EIO;
+ return ret;
}
+ may_inplace = !((rq->pageofs_in + rq->inputsize) &
+ (EROFS_BLKSIZ - 1));
}
- rq->inputsize -= inputmargin;
- src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin,
- &maptype, support_0padding);
+ inputmargin = rq->pageofs_in;
+ src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin,
+ &maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -240,9 +259,9 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
}
if (maptype == 0) {
- kunmap_atomic(src);
+ kunmap_atomic(headpage);
} else if (maptype == 1) {
- vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
+ vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
erofs_put_pcpubuf(src);
} else {
@@ -255,14 +274,18 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq,
static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
int ret;
+ ctx.rq = rq;
+ ctx.oend = rq->pageofs_out + rq->outputsize;
+ ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
+ ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+
/* one optimized fast path only for non bigpcluster cases yet */
- if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) {
+ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
dst = kmap_atomic(*rq->out);
dst_maptype = 0;
@@ -270,27 +293,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
}
/* general decoding path which can be used for all cases */
- ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
- if (ret < 0)
+ ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
+ if (ret < 0) {
return ret;
- if (ret) {
+ } else if (ret > 0) {
dst = page_address(*rq->out);
dst_maptype = 1;
- goto dstmap_out;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, ctx.outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
}
- dst = erofs_vm_map_ram(rq->out, nrpages_out);
- if (!dst)
- return -ENOMEM;
- dst_maptype = 2;
-
dstmap_out:
- ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out);
-
+ ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
if (!dst_maptype)
kunmap_atomic(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, nrpages_out);
+ vm_unmap_ram(dst, ctx.outpages);
return ret;
}
@@ -299,7 +320,8 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+ const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
+ PAGE_SIZE - rq->pageofs_out);
unsigned char *src, *dst;
if (nrpages_out > 2) {
@@ -312,7 +334,7 @@ static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
return 0;
}
- src = kmap_atomic(*rq->in);
+ src = kmap_atomic(*rq->in) + rq->pageofs_in;
if (rq->out[0]) {
dst = kmap_atomic(rq->out[0]);
memcpy(dst + rq->pageofs_out, src, righthalf);
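Both LZ4 and, in the next file, LZMA now delegate the zero-padding handling to z_erofs_fixup_insize(). A condensed sketch of what a caller does, matching the two call sites in this diff (kaddr is the mapped first input page; the wrapper name is illustrative):

static int example_trim_padding(struct z_erofs_decompress_req *rq, void *kaddr)
{
	/* padding, if any, sits in front of the stream inside the first block */
	return z_erofs_fixup_insize(rq, (char *)kaddr + rq->pageofs_in,
				    min_t(unsigned int, rq->inputsize,
					  EROFS_BLKSIZ - rq->pageofs_in));
}

On success rq->inputsize shrinks by the padding length and rq->pageofs_in advances past it; an all-zero buffer is reported as -EFSCORRUPTED.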
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 50045510a1f4..05a3063cf2bc 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -156,7 +156,7 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
const unsigned int nrpages_in =
PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- unsigned int inputmargin, inlen, outlen, pageofs;
+ unsigned int inlen, outlen, pageofs;
struct z_erofs_lzma *strm;
u8 *kin;
bool bounced = false;
@@ -164,16 +164,13 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
/* 1. get the exact LZMA compressed size */
kin = kmap(*rq->in);
- inputmargin = 0;
- while (!kin[inputmargin & ~PAGE_MASK])
- if (!(++inputmargin & ~PAGE_MASK))
- break;
-
- if (inputmargin >= PAGE_SIZE) {
+ err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ EROFS_BLKSIZ - rq->pageofs_in));
+ if (err) {
kunmap(*rq->in);
- return -EFSCORRUPTED;
+ return err;
}
- rq->inputsize -= inputmargin;
/* 2. get an available lzma context */
again:
@@ -193,9 +190,9 @@ again:
xz_dec_microlzma_reset(strm->state, inlen, outlen,
!rq->partial_decoding);
pageofs = rq->pageofs_out;
- strm->buf.in = kin + inputmargin;
+ strm->buf.in = kin + rq->pageofs_in;
strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin);
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
inlen -= strm->buf.in_size;
strm->buf.out = NULL;
strm->buf.out_pos = 0;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 083997a034e5..3ea62c6fb00a 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -17,19 +17,21 @@
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
* be incompatible with this kernel version.
*/
-#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
+#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
+#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
#define EROFS_ALL_FEATURE_INCOMPAT \
- (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
- EROFS_FEATURE_INCOMPAT_COMPR_HEAD2)
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
+ EROFS_FEATURE_INCOMPAT_ZTAILPACKING)
#define EROFS_SB_EXTSLOT_SIZE 16
@@ -209,7 +211,7 @@ struct erofs_xattr_ibody_header {
__le32 h_reserved;
__u8 h_shared_count;
__u8 h_reserved2[7];
- __le32 h_shared_xattrs[0]; /* shared xattr id array */
+ __le32 h_shared_xattrs[]; /* shared xattr id array */
};
/* Name indexes */
@@ -226,7 +228,7 @@ struct erofs_xattr_entry {
__u8 e_name_index; /* attribute name index */
__le16 e_value_size; /* size of attribute value */
/* followed by e_name and e_value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
@@ -292,13 +294,17 @@ struct z_erofs_lzma_cfgs {
* (4B) + 2B + (4B) if compacted 2B is on.
* bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
* bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
+ * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
+#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
struct z_erofs_map_header {
- __le32 h_reserved1;
+ __le16 h_reserved1;
+ /* indicates the encoded size of tailpacking data */
+ __le16 h_idata_size;
__le16 h_advise;
/*
* bit 0-3 : algorithm type of head 1 (logical cluster type 01);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 2345f1de438e..ff62f84f47d3 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -13,8 +13,8 @@
* the inode payload page if it's an extended inode) in order to fill
* inline data if possible.
*/
-static struct page *erofs_read_inode(struct inode *inode,
- unsigned int *ofs)
+static void *erofs_read_inode(struct erofs_buf *buf,
+ struct inode *inode, unsigned int *ofs)
{
struct super_block *sb = inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -22,7 +22,7 @@ static struct page *erofs_read_inode(struct inode *inode,
const erofs_off_t inode_loc = iloc(sbi, vi->nid);
erofs_blk_t blkaddr, nblks = 0;
- struct page *page;
+ void *kaddr;
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
unsigned int ifmt;
@@ -34,14 +34,14 @@ static struct page *erofs_read_inode(struct inode *inode,
erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
__func__, vi->nid, *ofs, blkaddr);
- page = erofs_get_meta_page(sb, blkaddr);
- if (IS_ERR(page)) {
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(page));
- return page;
+ vi->nid, PTR_ERR(kaddr));
+ return kaddr;
}
- dic = page_address(page) + *ofs;
+ dic = kaddr + *ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
@@ -62,12 +62,12 @@ static struct page *erofs_read_inode(struct inode *inode,
switch (erofs_inode_version(ifmt)) {
case EROFS_INODE_LAYOUT_EXTENDED:
vi->inode_isize = sizeof(struct erofs_inode_extended);
- /* check if the inode acrosses page boundary */
- if (*ofs + vi->inode_isize <= PAGE_SIZE) {
+ /* check if the extended inode crosses block boundary */
+ if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) {
*ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
} else {
- const unsigned int gotten = PAGE_SIZE - *ofs;
+ const unsigned int gotten = EROFS_BLKSIZ - *ofs;
copied = kmalloc(vi->inode_isize, GFP_NOFS);
if (!copied) {
@@ -75,18 +75,16 @@ static struct page *erofs_read_inode(struct inode *inode,
goto err_out;
}
memcpy(copied, dic, gotten);
- unlock_page(page);
- put_page(page);
-
- page = erofs_get_meta_page(sb, blkaddr + 1);
- if (IS_ERR(page)) {
- erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld",
- vi->nid, PTR_ERR(page));
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
+ vi->nid, PTR_ERR(kaddr));
kfree(copied);
- return page;
+ return kaddr;
}
*ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, page_address(page), *ofs);
+ memcpy((u8 *)copied + gotten, kaddr, *ofs);
die = copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
@@ -200,7 +198,7 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
else
inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
- return page;
+ return kaddr;
bogusimode:
erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
@@ -209,12 +207,11 @@ bogusimode:
err_out:
DBG_BUGON(1);
kfree(copied);
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(buf);
return ERR_PTR(err);
}
-static int erofs_fill_symlink(struct inode *inode, void *data,
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
unsigned int m_pofs)
{
struct erofs_inode *vi = EROFS_I(inode);
@@ -222,7 +219,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
/* if it cannot be handled with fast symlink scheme */
if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= PAGE_SIZE) {
+ inode->i_size >= EROFS_BLKSIZ) {
inode->i_op = &erofs_symlink_iops;
return 0;
}
@@ -232,8 +229,8 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
return -ENOMEM;
m_pofs += vi->xattr_isize;
- /* inline symlink data shouldn't cross page boundary as well */
- if (m_pofs + inode->i_size > PAGE_SIZE) {
+ /* inline symlink data shouldn't cross block boundary */
+ if (m_pofs + inode->i_size > EROFS_BLKSIZ) {
kfree(lnk);
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
@@ -241,8 +238,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
-
- memcpy(lnk, data + m_pofs, inode->i_size);
+ memcpy(lnk, kaddr + m_pofs, inode->i_size);
lnk[inode->i_size] = '\0';
inode->i_link = lnk;
@@ -253,16 +249,17 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
static int erofs_fill_inode(struct inode *inode, int isdir)
{
struct erofs_inode *vi = EROFS_I(inode);
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *kaddr;
unsigned int ofs;
int err = 0;
trace_erofs_fill_inode(inode, isdir);
/* read inode base data from disk */
- page = erofs_read_inode(inode, &ofs);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ kaddr = erofs_read_inode(&buf, inode, &ofs);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
/* setup the new inode */
switch (inode->i_mode & S_IFMT) {
@@ -278,7 +275,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
inode->i_fop = &erofs_dir_fops;
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, page_address(page), ofs);
+ err = erofs_fill_symlink(inode, kaddr, ofs);
if (err)
goto out_unlock;
inode_nohighmem(inode);
@@ -302,8 +299,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
inode->i_mapping->a_ops = &erofs_raw_access_aops;
out_unlock:
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return err;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 3265688af7f9..3db494a398b2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -56,12 +56,18 @@ struct erofs_device_info {
u32 mapped_blkaddr;
};
+enum {
+ EROFS_SYNC_DECOMPRESS_AUTO,
+ EROFS_SYNC_DECOMPRESS_FORCE_ON,
+ EROFS_SYNC_DECOMPRESS_FORCE_OFF
+};
+
struct erofs_mount_opts {
#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
- /* strategy of sync decompression (false - auto, true - force on) */
- bool readahead_sync_decompress;
+ /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
+ unsigned int sync_decompress;
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
@@ -134,6 +140,10 @@ struct erofs_sb_info {
u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
+
+ /* sysfs support */
+ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
+ struct completion s_kobj_unregister;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -241,6 +251,19 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
#error erofs cannot be used in this platform
#endif
+enum erofs_kmap_type {
+ EROFS_NO_KMAP, /* don't map the buffer */
+ EROFS_KMAP, /* use kmap() to map the buffer */
+ EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
+};
+
+struct erofs_buf {
+ struct page *page;
+ void *base;
+ enum erofs_kmap_type kmap_type;
+};
+#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
+
#define ROOT_NID(sb) ((sb)->root_nid)
#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
@@ -258,10 +281,13 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
return sbi->feature_##compat & EROFS_FEATURE_##feature; \
}
-EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
+EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING)
EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
+EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
+EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -296,6 +322,9 @@ struct erofs_inode {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
+ unsigned long z_tailextent_headlcn;
+ unsigned int z_idataoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -390,14 +419,14 @@ enum {
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
struct erofs_map_blocks {
+ struct erofs_buf buf;
+
erofs_off_t m_pa, m_la;
u64 m_plen, m_llen;
unsigned short m_deviceid;
char m_algorithmformat;
unsigned int m_flags;
-
- struct page *mpage;
};
/* Flags used by erofs_map_blocks_flatmode() */
@@ -409,6 +438,8 @@ struct erofs_map_blocks {
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
/* Used to map the whole extent if non-negligible data is requested for LZMA */
#define EROFS_GET_BLOCKS_READMORE 0x0004
+/* Used to map tail extent for tailpacking inline pcluster */
+#define EROFS_GET_BLOCKS_FINDTAIL 0x0008
enum {
Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
@@ -443,7 +474,10 @@ struct erofs_map_dev {
/* data.c */
extern const struct file_operations erofs_file_fops;
-struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+void erofs_unmap_metabuf(struct erofs_buf *buf);
+void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -498,6 +532,12 @@ int erofs_pcpubuf_growsize(unsigned int nrpages);
void erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
+/* sysfs.c */
+int erofs_register_sysfs(struct super_block *sb);
+void erofs_unregister_sysfs(struct super_block *sb);
+int __init erofs_init_sysfs(void);
+void erofs_exit_sysfs(void);
+
/* utils.c / zdata.c */
struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
static inline void erofs_pagepool_add(struct page **pagepool,
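
For reference, each EROFS_FEATURE_FUNCS() invocation added above (zero_padding, chunked_file, compr_head2, ztailpacking) expands into a small predicate on the superblock feature bits, following the macro body visible in this hunk. Roughly, for the ztailpacking case; the EROFS_FEATURE_INCOMPAT_ZTAILPACKING bit itself is defined in erofs_fs.h and only assumed here:

/* Approximate expansion of
 * EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
 */
static inline bool erofs_sb_has_ztailpacking(struct erofs_sb_info *sbi)
{
        return sbi->feature_incompat & EROFS_FEATURE_INCOMPAT_ZTAILPACKING;
}

erofs_read_superblock() in the super.c hunks below uses exactly this predicate to print the EXPERIMENTAL ztailpacking notice.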
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 6a969b1e0ee6..5c137647fa8a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include <linux/module.h>
#include <linux/buffer_head.h>
@@ -124,80 +125,50 @@ static bool check_layout_compatibility(struct super_block *sb,
#ifdef CONFIG_EROFS_FS_ZIP
/* read variable-sized metadata, offset will be aligned by 4-byte */
-static void *erofs_read_metadata(struct super_block *sb, struct page **pagep,
+static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp)
{
- struct page *page = *pagep;
u8 *buffer, *ptr;
int len, i, cnt;
- erofs_blk_t blk;
*offset = round_up(*offset, 4);
- blk = erofs_blknr(*offset);
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return ptr;
- if (!page || page->index != blk) {
- if (page) {
- unlock_page(page);
- put_page(page);
- }
- page = erofs_get_meta_page(sb, blk);
- if (IS_ERR(page))
- goto err_nullpage;
- }
-
- ptr = kmap(page);
len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
if (!len)
len = U16_MAX + 1;
buffer = kmalloc(len, GFP_KERNEL);
- if (!buffer) {
- buffer = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
*offset += sizeof(__le16);
*lengthp = len;
for (i = 0; i < len; i += cnt) {
cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
- blk = erofs_blknr(*offset);
-
- if (!page || page->index != blk) {
- if (page) {
- kunmap(page);
- unlock_page(page);
- put_page(page);
- }
- page = erofs_get_meta_page(sb, blk);
- if (IS_ERR(page)) {
- kfree(buffer);
- goto err_nullpage;
- }
- ptr = kmap(page);
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset),
+ EROFS_KMAP);
+ if (IS_ERR(ptr)) {
+ kfree(buffer);
+ return ptr;
}
memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
*offset += cnt;
}
-out:
- kunmap(page);
- *pagep = page;
return buffer;
-err_nullpage:
- *pagep = NULL;
- return page;
}
static int erofs_load_compr_cfgs(struct super_block *sb,
struct erofs_super_block *dsb)
{
- struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
unsigned int algs, alg;
erofs_off_t offset;
- int size, ret;
+ int size, ret = 0;
- sbi = EROFS_SB(sb);
sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
-
if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
@@ -205,20 +176,17 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
- page = NULL;
alg = 0;
- ret = 0;
-
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
void *data;
if (!(algs & 1))
continue;
- data = erofs_read_metadata(sb, &page, &offset, &size);
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
- goto err;
+ break;
}
switch (alg) {
@@ -234,13 +202,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
kfree(data);
if (ret)
- goto err;
- }
-err:
- if (page) {
- unlock_page(page);
- put_page(page);
+ break;
}
+ erofs_put_metabuf(&buf);
return ret;
}
#else
@@ -261,7 +225,7 @@ static int erofs_init_devices(struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
unsigned int ondisk_extradevs;
erofs_off_t pos;
- struct page *page = NULL;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_device_info *dif;
struct erofs_deviceslot *dis;
void *ptr;
@@ -285,22 +249,13 @@ static int erofs_init_devices(struct super_block *sb,
pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
down_read(&sbi->devs->rwsem);
idr_for_each_entry(&sbi->devs->tree, dif, id) {
- erofs_blk_t blk = erofs_blknr(pos);
struct block_device *bdev;
- if (!page || page->index != blk) {
- if (page) {
- kunmap(page);
- unlock_page(page);
- put_page(page);
- }
-
- page = erofs_get_meta_page(sb, blk);
- if (IS_ERR(page)) {
- up_read(&sbi->devs->rwsem);
- return PTR_ERR(page);
- }
- ptr = kmap(page);
+ ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
+ EROFS_KMAP);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ break;
}
dis = ptr + erofs_blkoff(pos);
@@ -309,7 +264,7 @@ static int erofs_init_devices(struct super_block *sb,
sb->s_type);
if (IS_ERR(bdev)) {
err = PTR_ERR(bdev);
- goto err_out;
+ break;
}
dif->bdev = bdev;
dif->dax_dev = fs_dax_get_by_bdev(bdev);
@@ -318,13 +273,8 @@ static int erofs_init_devices(struct super_block *sb,
sbi->total_blocks += dif->blocks;
pos += EROFS_DEVT_SLOT_SIZE;
}
-err_out:
up_read(&sbi->devs->rwsem);
- if (page) {
- kunmap(page);
- unlock_page(page);
- put_page(page);
- }
+ erofs_put_metabuf(&buf);
return err;
}
@@ -411,6 +361,9 @@ static int erofs_read_superblock(struct super_block *sb)
/* handle multiple devices */
ret = erofs_init_devices(sb, dsb);
+
+ if (erofs_sb_has_ztailpacking(sbi))
+ erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
out:
kunmap(page);
put_page(page);
@@ -423,7 +376,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
#ifdef CONFIG_EROFS_FS_ZIP
ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
ctx->opt.max_sync_decompress_pages = 3;
- ctx->opt.readahead_sync_decompress = false;
+ ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
set_opt(&ctx->opt, XATTR_USER);
@@ -695,6 +648,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ err = erofs_register_sysfs(sb);
+ if (err)
+ return err;
+
erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
return 0;
}
@@ -808,6 +765,7 @@ static void erofs_put_super(struct super_block *sb)
DBG_BUGON(!sbi);
+ erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
@@ -852,6 +810,10 @@ static int __init erofs_module_init(void)
if (err)
goto zip_err;
+ err = erofs_init_sysfs();
+ if (err)
+ goto sysfs_err;
+
err = register_filesystem(&erofs_fs_type);
if (err)
goto fs_err;
@@ -859,6 +821,8 @@ static int __init erofs_module_init(void)
return 0;
fs_err:
+ erofs_exit_sysfs();
+sysfs_err:
z_erofs_exit_zip_subsystem();
zip_err:
z_erofs_lzma_exit();
@@ -877,6 +841,7 @@ static void __exit erofs_module_exit(void)
/* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+ erofs_exit_sysfs();
z_erofs_exit_zip_subsystem();
z_erofs_lzma_exit();
erofs_exit_shrinker();
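
super.c now wires in erofs_register_sysfs()/erofs_unregister_sysfs() per mount and erofs_init_sysfs()/erofs_exit_sysfs() at module init/exit, but the portion of the new sysfs.c that implements them is cut off below. Based on the s_kobj/s_kobj_unregister fields added to struct erofs_sb_info and the ATTRIBUTE_GROUPS(erofs) declaration that is visible, the implementation presumably follows the usual per-superblock kobject pattern, roughly as sketched here; the kset/ktype names and the omitted show/store sysfs_ops are assumptions, not taken from this patch:

/* Rough sketch only; relies on the includes at the top of sysfs.c.
 * "erofs_root" and "erofs_sb_ktype" are assumed names, and the
 * sysfs_ops that dispatch show/store for erofs_attrs are omitted.
 */
static struct kset *erofs_root;         /* backs /sys/fs/erofs */

static void erofs_sb_release(struct kobject *kobj)
{
        struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
                                                 s_kobj);
        complete(&sbi->s_kobj_unregister);
}

static struct kobj_type erofs_sb_ktype = {
        .default_groups = erofs_groups, /* from ATTRIBUTE_GROUPS(erofs) */
        .release        = erofs_sb_release,
};

int __init erofs_init_sysfs(void)
{
        /* fs_kobj is the /sys/fs kobject from <linux/fs.h> */
        erofs_root = kset_create_and_add("erofs", NULL, fs_kobj);
        return erofs_root ? 0 : -ENOMEM;
}

void erofs_exit_sysfs(void)
{
        kset_unregister(erofs_root);
}

int erofs_register_sysfs(struct super_block *sb)
{
        struct erofs_sb_info *sbi = EROFS_SB(sb);
        int err;

        sbi->s_kobj.kset = erofs_root;
        init_completion(&sbi->s_kobj_unregister);
        err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL,
                                   "%s", sb->s_id);
        if (err)
                kobject_put(&sbi->s_kobj);
        return err;
}

void erofs_unregister_sysfs(struct super_block *sb)
{
        struct erofs_sb_info *sbi = EROFS_SB(sb);

        kobject_del(&sbi->s_kobj);
        kobject_put(&sbi->s_kobj);
        wait_for_completion(&sbi->s_kobj_unregister);
}

The completion in the release callback is what lets erofs_unregister_sysfs() block until the kobject is truly gone before erofs_put_super() tears down the rest of the superblock state.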
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
new file mode 100644
index 000000000000..dac252bc9228
--- /dev/null
+++ b/fs/erofs/sysfs.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd.
+ * https://www.oppo.com/
+ */
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+
+#include "internal.h"
+
+enum {
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_bool,
+};
+
+enum {
+ struct_erofs_sb_info,
+ struct_erofs_mount_opts,
+};
+
+struct erofs_attr {
+ struct attribute attr;
+ short attr_id;
+ int struct_type, offset;
+};
+
+#define EROFS_ATTR(_name, _mode, _id) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name)
+#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature)
+
+#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .struct_type = struct_##_struct, \
+ .offset = offsetof(struct _struct, _name),\
+}
+
+#define EROFS_ATTR_RW(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0644, _id, _struct)
+
+#define EROFS_RO_ATTR(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0444, _id, _struct)
+
+#define EROFS_ATTR_RW_UI(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_ui, _struct)
+
+#define EROFS_ATTR_RW_BOOL(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_bool, _struct)
+
+#define ATTR_LIST(name) (&erofs_attr_##name.attr)
+
+#ifdef CONFIG_EROFS_FS_ZIP
+EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+#endif
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP
+ ATTR_LIST(sync_decompress),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs);
+
+/* Features this copy of erofs supports */
+EROFS_ATTR_FEATURE(zero_padding);
+EROFS_ATTR_FEATURE(compr_cfgs);
+EROFS_ATTR_FEATURE(big_pcluster);
+EROFS_ATTR_FEATURE(chunked_file);
+EROFS_ATTR_FEATURE(device_table);
+EROFS_ATTR_FEATURE(compr_head2);
+EROFS_ATTR_FEATURE(sb_chksum);
+EROFS_ATTR_FEATURE(ztailpacking);
+
+static struct attribute *erofs_feat_attrs[] = {
+ ATTR_LIST(zero_padding),
+ ATTR_LIST(compr_cfgs),
+ ATTR_LIST(big_pcluster),
+ ATTR_LIST(chunked_file),
+ ATTR_LIST(device_table),
+ ATTR