aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/Makefile28
-rw-r--r--fs/btrfs/acl.c47
-rw-r--r--fs/btrfs/async-thread.c130
-rw-r--r--fs/btrfs/async-thread.h9
-rw-r--r--fs/btrfs/backref.c1531
-rw-r--r--fs/btrfs/backref.h331
-rw-r--r--fs/btrfs/block-group.c2212
-rw-r--r--fs/btrfs/block-group.h106
-rw-r--r--fs/btrfs/block-rsv.c213
-rw-r--r--fs/btrfs/block-rsv.h41
-rw-r--r--fs/btrfs/btrfs_inode.h266
-rw-r--r--fs/btrfs/check-integrity.c650
-rw-r--r--fs/btrfs/check-integrity.h8
-rw-r--r--fs/btrfs/compression.c1147
-rw-r--r--fs/btrfs/compression.h102
-rw-r--r--fs/btrfs/ctree.c2618
-rw-r--r--fs/btrfs/ctree.h1725
-rw-r--r--fs/btrfs/delalloc-space.c346
-rw-r--r--fs/btrfs/delalloc-space.h13
-rw-r--r--fs/btrfs/delayed-inode.c954
-rw-r--r--fs/btrfs/delayed-inode.h50
-rw-r--r--fs/btrfs/delayed-ref.c141
-rw-r--r--fs/btrfs/delayed-ref.h80
-rw-r--r--fs/btrfs/dev-replace.c411
-rw-r--r--fs/btrfs/dev-replace.h7
-rw-r--r--fs/btrfs/dir-item.c166
-rw-r--r--fs/btrfs/discard.c115
-rw-r--r--fs/btrfs/discard.h5
-rw-r--r--fs/btrfs/disk-io.c3582
-rw-r--r--fs/btrfs/disk-io.h117
-rw-r--r--fs/btrfs/export.c48
-rw-r--r--fs/btrfs/export.h5
-rw-r--r--fs/btrfs/extent-io-tree.c1674
-rw-r--r--fs/btrfs/extent-io-tree.h175
-rw-r--r--fs/btrfs/extent-tree.c2096
-rw-r--r--fs/btrfs/extent_io.c6717
-rw-r--r--fs/btrfs/extent_io.h222
-rw-r--r--fs/btrfs/extent_map.c375
-rw-r--r--fs/btrfs/extent_map.h16
-rw-r--r--fs/btrfs/file-item.c770
-rw-r--r--fs/btrfs/file.c2659
-rw-r--r--fs/btrfs/free-space-cache.c1294
-rw-r--r--fs/btrfs/free-space-cache.h37
-rw-r--r--fs/btrfs/free-space-tree.c110
-rw-r--r--fs/btrfs/inode-item.c350
-rw-r--r--fs/btrfs/inode-item.h96
-rw-r--r--fs/btrfs/inode-map.c581
-rw-r--r--fs/btrfs/inode-map.h16
-rw-r--r--fs/btrfs/inode.c8359
-rw-r--r--fs/btrfs/ioctl.c3857
-rw-r--r--fs/btrfs/locking.c693
-rw-r--r--fs/btrfs/locking.h124
-rw-r--r--fs/btrfs/lzo.c531
-rw-r--r--fs/btrfs/misc.h91
-rw-r--r--fs/btrfs/ordered-data.c830
-rw-r--r--fs/btrfs/ordered-data.h139
-rw-r--r--fs/btrfs/print-tree.c78
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/props.c134
-rw-r--r--fs/btrfs/props.h8
-rw-r--r--fs/btrfs/qgroup.c823
-rw-r--r--fs/btrfs/qgroup.h43
-rw-r--r--fs/btrfs/raid56.c1677
-rw-r--r--fs/btrfs/raid56.h177
-rw-r--r--fs/btrfs/rcu-string.h2
-rw-r--r--fs/btrfs/reada.c1014
-rw-r--r--fs/btrfs/ref-verify.c108
-rw-r--r--fs/btrfs/reflink.c930
-rw-r--r--fs/btrfs/reflink.h12
-rw-r--r--fs/btrfs/relocation.c3118
-rw-r--r--fs/btrfs/root-tree.c114
-rw-r--r--fs/btrfs/scrub.c3178
-rw-r--r--fs/btrfs/send.c2704
-rw-r--r--fs/btrfs/send.h203
-rw-r--r--fs/btrfs/space-info.c1390
-rw-r--r--fs/btrfs/space-info.h52
-rw-r--r--fs/btrfs/struct-funcs.c222
-rw-r--r--fs/btrfs/subpage.c768
-rw-r--r--fs/btrfs/subpage.h158
-rw-r--r--fs/btrfs/super.c864
-rw-r--r--fs/btrfs/sysfs.c1154
-rw-r--r--fs/btrfs/sysfs.h18
-rw-r--r--fs/btrfs/tests/btrfs-tests.c63
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c23
-rw-r--r--fs/btrfs/tests/extent-io-tests.c127
-rw-r--r--fs/btrfs/tests/extent-map-tests.c6
-rw-r--r--fs/btrfs/tests/free-space-tests.c203
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c7
-rw-r--r--fs/btrfs/tests/inode-tests.c44
-rw-r--r--fs/btrfs/tests/qgroup-tests.c77
-rw-r--r--fs/btrfs/transaction.c946
-rw-r--r--fs/btrfs/transaction.h72
-rw-r--r--fs/btrfs/tree-checker.c587
-rw-r--r--fs/btrfs/tree-checker.h1
-rw-r--r--fs/btrfs/tree-defrag.c16
-rw-r--r--fs/btrfs/tree-log.c4830
-rw-r--r--fs/btrfs/tree-log.h66
-rw-r--r--fs/btrfs/tree-mod-log.c929
-rw-r--r--fs/btrfs/tree-mod-log.h53
-rw-r--r--fs/btrfs/uuid-tree.c66
-rw-r--r--fs/btrfs/verity.c812
-rw-r--r--fs/btrfs/volumes.c3746
-rw-r--r--fs/btrfs/volumes.h372
-rw-r--r--fs/btrfs/xattr.c105
-rw-r--r--fs/btrfs/zlib.c70
-rw-r--r--fs/btrfs/zoned.c2348
-rw-r--r--fs/btrfs/zoned.h420
-rw-r--r--fs/btrfs/zstd.c138
109 files changed, 51525 insertions, 32573 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..183e5c4aed34 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,9 +14,11 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
+ depends on PAGE_SIZE_LESS_THAN_256KB
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9a0ff3384381..fa9ddcc9eb0b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,21 +1,43 @@
# SPDX-License-Identifier: GPL-2.0
+# Subset of W=1 warnings
+subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter
+subdir-ccflags-y += -Wmissing-declarations
+subdir-ccflags-y += -Wmissing-format-attribute
+subdir-ccflags-y += -Wmissing-prototypes
+subdir-ccflags-y += -Wold-style-definition
+subdir-ccflags-y += -Wmissing-include-dirs
+condflags := \
+ $(call cc-option, -Wunused-but-set-variable) \
+ $(call cc-option, -Wunused-const-variable) \
+ $(call cc-option, -Wpacked-not-aligned) \
+ $(call cc-option, -Wstringop-truncation)
+subdir-ccflags-y += $(condflags)
+# The following turn off the warnings enabled by -Wextra
+subdir-ccflags-y += -Wno-missing-field-initializers
+subdir-ccflags-y += -Wno-sign-compare
+subdir-ccflags-y += -Wno-type-limits
+subdir-ccflags-y += -Wno-shift-negative-value
+
obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
- file-item.o inode-item.o inode-map.o disk-io.o \
+ file-item.o inode-item.o disk-io.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o discard.o
+ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
+ subpage.o tree-mod-log.o extent-io-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index a0af1b952c4d..548d6a5477b4 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -16,13 +16,16 @@
#include "btrfs_inode.h"
#include "xattr.h"
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
const char *name;
char *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -52,8 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
return acl;
}
-static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct posix_acl *acl, int type)
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -107,13 +110,15 @@ out:
return ret;
}
-int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret;
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
- ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ ret = posix_acl_update_mode(mnt_userns, inode,
+ &inode->i_mode, &acl);
if (ret)
return ret;
}
@@ -122,35 +127,3 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
inode->i_mode = old_mode;
return ret;
}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int ret = 0;
-
- /* this happens with subvols */
- if (!dir)
- return 0;
-
- ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (ret)
- return ret;
-
- if (default_acl) {
- ret = __btrfs_set_acl(trans, inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!ret)
- ret = __btrfs_set_acl(trans, inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
-
- if (!default_acl && !acl)
- cache_no_acl(inode);
- return ret;
-}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 1d32a07bb2d1..aac240430efe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -15,13 +15,12 @@
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
- WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
-struct __btrfs_workqueue {
+struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
spinlock_t thres_lock;
};
-struct btrfs_workqueue {
- struct __btrfs_workqueue *normal;
- struct __btrfs_workqueue *high;
-};
-
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
return wq->fs_info;
}
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
- * We could compare wq->normal->pending with num_online_cpus()
+ * We could compare wq->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
- if (wq->normal->thresh == NO_THRESHOLD)
+ if (wq->thresh == NO_THRESHOLD)
return false;
- return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+ return atomic_read(&wq->pending) > wq->thresh * 2;
}
-static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
- unsigned int flags, int limit_active, int thresh)
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
ret->thresh = thresh;
}
- if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
- ret->current_active, name);
- else
- ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
- ret->current_active, name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
- trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
- return ret;
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-
-struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
- const char *name,
- unsigned int flags,
- int limit_active,
- int thresh)
-{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
-
- if (!ret)
- return NULL;
-
- ret->normal = __btrfs_alloc_workqueue(fs_info, name,
- flags & ~WQ_HIGHPRI,
- limit_active, thresh);
- if (!ret->normal) {
- kfree(ret);
- return NULL;
- }
-
- if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
- limit_active, thresh);
- if (!ret->high) {
- __btrfs_destroy_workqueue(ret->normal);
- kfree(ret);
- return NULL;
- }
- }
+ trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
-static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
* This hook is called in kthread content.
* So workqueue_set_max_active is called here.
*/
-static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
@@ -217,7 +173,7 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq,
+static void run_ordered_work(struct btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
@@ -234,6 +190,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -298,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq = work->wq;
int need_order = 0;
/*
@@ -311,12 +274,18 @@ static void btrfs_work_helper(struct work_struct *normal_work)
*/
if (work->ordered_func)
need_order = 1;
- wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to executed the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
} else {
@@ -336,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
work->flags = 0;
}
-static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
- struct btrfs_work *work)
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
unsigned long flags;
@@ -352,46 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
queue_work(wq->normal_wq, &work->normal_work);
}
-void btrfs_queue_work(struct btrfs_workqueue *wq,
- struct btrfs_work *work)
-{
- struct __btrfs_workqueue *dest_wq;
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
- dest_wq = wq->high;
- else
- dest_wq = wq->normal;
- __btrfs_queue_work(dest_wq, work);
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
-{
- destroy_workqueue(wq->normal_wq);
- trace_btrfs_workqueue_destroy(wq);
- kfree(wq);
-}
-
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
- if (wq->high)
- __btrfs_destroy_workqueue(wq->high);
- __btrfs_destroy_workqueue(wq->normal);
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
- if (!wq)
- return;
- wq->normal->limit_active = limit_active;
- if (wq->high)
- wq->high->limit_active = limit_active;
+ if (wq)
+ wq->limit_active = limit_active;
}
-void btrfs_set_work_high_priority(struct btrfs_work *work)
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ flush_workqueue(wq->normal_wq);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index a4434301d84d..6e2596ddae10 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -11,11 +11,8 @@
struct btrfs_fs_info;
struct btrfs_workqueue;
-/* Internal use only */
-struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
@@ -25,7 +22,7 @@ struct btrfs_work {
/* Don't touch things below */
struct work_struct normal_work;
struct list_head ordered_list;
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq;
unsigned long flags;
};
@@ -40,9 +37,9 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
-void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e5d85311d5d5..18374a6d05bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -13,6 +13,8 @@
#include "transaction.h"
#include "delayed-ref.h"
#include "locking.h"
+#include "misc.h"
+#include "tree-mod-log.h"
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -136,6 +138,7 @@ struct share_check {
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -286,8 +289,10 @@ static void prelim_release(struct preftree *preftree)
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -347,33 +352,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
return -ENOMEM;
ref->root_id = root_id;
- if (key) {
+ if (key)
ref->key_for_search = *key;
- /*
- * We can often find data backrefs with an offset that is too
- * large (>= LLONG_MAX, maximum allowed file offset) due to
- * underflows when subtracting a file's offset with the data
- * offset of its corresponding extent data item. This can
- * happen for example in the clone ioctl.
- * So if we detect such case we set the search key's offset to
- * zero to make sure we will find the matching file extent item
- * at add_all_parents(), otherwise we will miss it because the
- * offset taken form the backref is much larger then the offset
- * of the file extent item. This can make us scan a very large
- * number of file extent items, but at least it will not make
- * us miss any.
- * This is an ugly workaround for a behaviour that should have
- * never existed, but it does and a fix for the clone ioctl
- * would touch a lot of places, cause backwards incompatibility
- * and would not fix the problem for extents cloned with older
- * kernels.
- */
- if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
- ref->key_for_search.offset >= LLONG_MAX)
- ref->key_for_search.offset = 0;
- } else {
+ else
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
- }
ref->inode_list = NULL;
ref->level = level;
@@ -409,10 +391,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
wanted_disk_byte, count, sc, gfp_mask);
}
+static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
+{
+ struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct prelim_ref *ref = NULL;
+ struct prelim_ref target = {};
+ int result;
+
+ target.parent = bytenr;
+
+ while (*p) {
+ parent = *p;
+ ref = rb_entry(parent, struct prelim_ref, rbnode);
+ result = prelim_ref_compare(ref, &target);
+
+ if (result < 0)
+ p = &(*p)->rb_left;
+ else if (result > 0)
+ p = &(*p)->rb_right;
+ else
+ return 1;
+ }
+ return 0;
+}
+
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
- struct ulist *parents, struct prelim_ref *ref,
+ struct ulist *parents,
+ struct preftrees *preftrees, struct prelim_ref *ref,
int level, u64 time_seq, const u64 *extent_item_pos,
- u64 total_refs, bool ignore_offset)
+ bool ignore_offset)
{
int ret = 0;
int slot;
@@ -424,6 +432,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
u64 disk_byte;
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
+ u64 data_offset;
if (level != 0) {
eb = path->nodes[level];
@@ -434,18 +443,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
/*
- * We normally enter this function with the path already pointing to
- * the first item to check. But sometimes, we may enter it with
- * slot==nritems. In that case, go to the next leaf before we continue.
+ * 1. We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot == nritems.
+ * 2. We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref
+ * 3. The leaf owner is not equal to the root we are searching
+ *
+ * For these cases, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- if (time_seq == SEQ_LAST)
+ eb = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(eb) ||
+ is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb)) {
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
}
- while (!ret && count < total_refs) {
+ while (!ret && count < ref->count) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -455,13 +472,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
key.type != BTRFS_EXTENT_DATA_KEY)
break;
+ /*
+ * We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref, OR
+ * the leaf owner is not equal to the root we are searching for
+ */
+ if (slot == 0 &&
+ (is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb))) {
+ if (time_seq == BTRFS_SEQ_LAST)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ continue;
+ }
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ data_offset = btrfs_file_extent_offset(eb, fi);
if (disk_byte == wanted_disk_byte) {
eie = NULL;
old = NULL;
- count++;
+ if (ref->key_for_search.offset == key.offset - data_offset)
+ count++;
+ else
+ goto next;
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
@@ -483,7 +518,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -502,59 +537,82 @@ next:
*/
static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
+ struct preftrees *preftrees,
struct prelim_ref *ref, struct ulist *parents,
- const u64 *extent_item_pos, u64 total_refs,
- bool ignore_offset)
+ const u64 *extent_item_pos, bool ignore_offset)
{
struct btrfs_root *root;
- struct btrfs_key root_key;
struct extent_buffer *eb;
int ret = 0;
int root_level;
int level = ref->level;
- int index;
+ struct btrfs_key search_key = ref->key_for_search;
- root_key.objectid = ref->root_id;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_get_fs_root(fs_info, &root_key, false);
+ /*
+ * If we're search_commit_root we could possibly be holding locks on
+ * other tree nodes. This happens when qgroups does backref walks when
+ * adding new delayed refs. To deal with this we need to look in cache
+ * for the root, and if we don't find it then we need to search the
+ * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
+ * here.
+ */
+ if (path->search_commit_root)
+ root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
+ else
+ root = btrfs_get_fs_root(fs_info, ref->root_id, false);
if (IS_ERR(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
+ goto out_free;
+ }
+
+ if (!path->search_commit_root &&
+ test_bit(BTRFS_ROOT_DELETING, &root->state)) {
+ ret = -ENOENT;
goto out;
}
if (btrfs_is_testing(fs_info)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -ENOENT;
goto out;
}
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == SEQ_LAST)
+ else if (time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
- if (root_level + 1 == level) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (root_level + 1 == level)
goto out;
- }
+ /*
+ * We can often find data backrefs with an offset that is too large
+ * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
+ * subtracting a file's offset with the data offset of its
+ * corresponding extent data item. This can happen for example in the
+ * clone ioctl.
+ *
+ * So if we detect such case we set the search key's offset to zero to
+ * make sure we will find the matching file extent item at
+ * add_all_parents(), otherwise we will miss it because the offset
+ * taken form the backref is much larger then the offset of the file
+ * extent item. This can make us scan a very large number of file
+ * extent items, but at least it will not make us miss any.
+ *
+ * This is an ugly workaround for a behaviour that should have never
+ * existed, but it does and a fix for the clone ioctl would touch a lot
+ * of places, cause backwards incompatibility and would not fix the
+ * problem for extents cloned with older kernels.
+ */
+ if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
+ search_key.offset >= LLONG_MAX)
+ search_key.offset = 0;
path->lowest_level = level;
- if (time_seq == SEQ_LAST)
- ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
- 0, 0);
+ if (time_seq == BTRFS_SEQ_LAST)
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
- time_seq);
-
- /* root node has been locked, we can release @subvol_srcu safely here */
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
btrfs_debug(fs_info,
"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
@@ -574,9 +632,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos, total_refs, ignore_offset);
+ ret = add_all_parents(root, path, parents, preftrees, ref, level,
+ time_seq, extent_item_pos, ignore_offset);
out:
+ btrfs_put_root(root);
+out_free:
path->lowest_level = 0;
btrfs_release_path(path);
return ret;
@@ -590,6 +650,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -609,7 +681,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct preftrees *preftrees,
- const u64 *extent_item_pos, u64 total_refs,
+ const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
int err;
@@ -653,9 +725,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos,
- total_refs, ignore_offset);
+ err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
+ ref, parents, extent_item_pos,
+ ignore_offset);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -704,7 +776,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -727,16 +803,18 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
- ref->level - 1, NULL);
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte,
+ ref->root_id, 0, ref->level - 1, NULL);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
}
+
if (lock)
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
@@ -758,20 +836,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head, u64 seq,
- struct preftrees *preftrees, u64 *total_refs,
- struct share_check *sc)
+ struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -793,15 +865,20 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
default:
BUG();
}
- *total_refs += count;
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -827,13 +904,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -863,7 +949,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -876,7 +962,7 @@ out:
static int add_inline_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct preftrees *preftrees,
- u64 *total_refs, struct share_check *sc)
+ struct share_check *sc)
{
int ret = 0;
int slot;
@@ -895,12 +981,11 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
- *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -967,7 +1052,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -977,6 +1063,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -995,12 +1082,12 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
*
* Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED.
*/
-static int add_keyed_refs(struct btrfs_fs_info *fs_info,
+static int add_keyed_refs(struct btrfs_root *extent_root,
struct btrfs_path *path, u64 bytenr,
int info_level, struct preftrees *preftrees,
struct share_check *sc)
{
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
int ret;
int slot;
struct extent_buffer *leaf;
@@ -1066,7 +1153,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1094,8 +1182,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
- * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
- * much like trans == NULL case, the difference only lies in it will not
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * behave much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
*
@@ -1116,6 +1204,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct ulist *roots, const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_delayed_ref_root *delayed_refs = NULL;
@@ -1125,8 +1214,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct prelim_ref *ref;
struct rb_node *node;
struct extent_inode_elem *eie = NULL;
- /* total of both direct AND indirect refs! */
- u64 total_refs = 0;
struct preftrees preftrees = {
.direct = PREFTREE_INIT,
.indirect = PREFTREE_INIT,
@@ -1148,31 +1235,29 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
- /*
- * grab both a lock on the path and a lock on the delayed ref head.
- * We need both to get a consistent picture of how the refs look
- * at a specified point in time
- */
again:
head = NULL;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != SEQ_LAST) {
-#else
- if (trans && time_seq != SEQ_LAST) {
-#endif
+ time_seq != BTRFS_SEQ_LAST) {
/*
- * look if there are updates for this ref queued and lock the
- * head
+ * We have a specific time_seq we care about and trans which
+ * means we have the path lock, we need to grab the ref head and
+ * lock it so we have a consistent view of the refs at the given
+ * time.
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -1195,7 +1280,7 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = add_delayed_refs(fs_info, head, time_seq,
- &preftrees, &total_refs, sc);
+ &preftrees, sc);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -1216,11 +1301,10 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = add_inline_refs(fs_info, path, bytenr,
- &info_level, &preftrees,
- &total_refs, sc);
+ &info_level, &preftrees, sc);
if (ret)
goto out;
- ret = add_keyed_refs(fs_info, path, bytenr, info_level,
+ ret = add_keyed_refs(root, path, bytenr, info_level,
&preftrees, sc);
if (ret)
goto out;
@@ -1236,7 +1320,7 @@ again:
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
- extent_item_pos, total_refs, sc, ignore_offset);
+ extent_item_pos, sc, ignore_offset);
if (ret)
goto out;
@@ -1281,28 +1365,33 @@ again:
struct extent_buffer *eb;
eb = read_tree_block(fs_info, ref->parent, 0,
- ref->level, NULL);
+ 0, ref->level, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
}
- if (!path->skip_locking) {
+ if (!path->skip_locking)
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
if (!path->skip_locking)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1311,15 +1400,31 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1336,24 +1441,6 @@ out:
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -1362,10 +1449,10 @@ static void free_leaf_list(struct ulist *blocks)
*
* returns 0 on success, <0 on error
*/
-static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset)
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset)
{
int ret;
@@ -1422,6 +1509,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
@@ -1438,23 +1526,150 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
- bool ignore_offset)
+ bool skip_commit_root_sem)
{
int ret;
- if (!trans)
+ if (!trans && !skip_commit_root_sem)
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
- time_seq, roots, ignore_offset);
- if (!trans)
+ time_seq, roots, false);
+ if (!trans && !skip_commit_root_sem)
up_read(&fs_info->commit_root_sem);
return ret;
}
-/**
- * btrfs_check_shared - tell us whether an extent is shared
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (!cache->use_cache)
+ return false;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &cache->entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+ /*
+ * If the node at this level is shared, than all nodes below are also
+ * shared. Currently some of the nodes below may be marked as not shared
+ * because we have just switched from one leaf to another, and switched
+ * also other nodes above the leaf and below the current level, so mark
+ * them as shared.
+ */
+ if (*is_shared) {
+ for (int i = 0; i < level; i++) {
+ cache->entries[i].is_shared = true;
+ cache->entries[i].gen = entry->gen;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (!cache->use_cache)
+ return;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &cache->entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+ * If we found an extent buffer is shared, set the cache result for all
+ * extent buffers below it to true. As nodes in the path are COWed,
+ * their sharedness is moved to their children, and if a leaf is COWed,
+ * then the sharedness of a data extent becomes direct, the refcount of
+ * data extent is increased in the extent item at the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &cache->entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
+ * Check if a data extent is shared or not.
*
- * btrfs_check_shared uses the backref walking code but will short
+ * @root: The root the inode belongs to.
+ * @inum: Number of the inode whose extent we are checking.
+ * @bytenr: Logical bytenr of the extent we are checking.
+ * @extent_gen: Generation of the extent (file extent item) or 0 if it is
+ * not known.
+ * @roots: List of roots this extent is shared among.
+ * @tmp: Temporary list used for iteration.
+ * @cache: A backref lookup result cache.
+ *
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
* one passed in. This provides a significant performance benefit for
* callers (such as fiemap) which want to know whether the extent is
@@ -1465,20 +1680,24 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
struct ulist_node *node;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
+ int level;
ulist_init(roots);
ulist_init(tmp);
@@ -1495,23 +1714,73 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
btrfs_get_tree_mod_seq(fs_info, &elem);
}
+ /* -1 means we are in the bytenr of the data extent. */
+ level = -1;
ULIST_ITER_INIT(&uiter);
+ cache->use_cache = true;
while (1) {
+ bool is_shared;
+ bool cached;
+
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
roots, NULL, &shared, false);
if (ret == BACKREF_FOUND_SHARED) {
/* this is the only condition under which we return 1 */
ret = 1;
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, true);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
+ /*
+ * If our data extent is not shared through reflinks and it was
+ * created in a generation after the last one used to create a
+ * snapshot of the inode's root, then it can not be shared
+ * indirectly through subtrees, as that can only happen with
+ * snapshots. In this case bail out, no need to check for the
+ * sharedness of extent buffers.
+ */
+ if (level == -1 &&
+ extent_gen > btrfs_root_last_snapshot(&root->root_item))
+ break;
+
+ /*
+ * If our data extent was not directly shared (without multiple
+ * reference items), than it might have a single reference item
+ * with a count > 1 for the same offset, which means there are 2
+ * (or more) file extent items that point to the data extent -
+ * this happens when a file extent item needs to be split and
+ * then one item gets moved to another leaf due to a b+tree leaf
+ * split when inserting some item. In this case the file extent
+ * items may be located in different leaves and therefore some
+ * of the leaves may be referenced through shared subtrees while
+ * others are not. Since our extent buffer cache only works for
+ * a single path (by far the most common case and simpler to
+ * deal with), we can not use it if we have multiple leaves
+ * (which implies multiple paths).
+ */
+ if (level == -1 && tmp->nnodes > 1)
+ cache->use_cache = false;
+
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, false);
node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
+ level++;
+ cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+ &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
@@ -1620,13 +1889,11 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
s64 bytes_left = ((s64)size) - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
- int leave_spinning = path->leave_spinning;
struct btrfs_inode_ref *iref;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
- path->leave_spinning = 1;
while (1) {
bytes_left -= name_len;
if (bytes_left >= 0)
@@ -1634,7 +1901,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
name_off, name_len);
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1654,8 +1921,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- if (!path->skip_locking)
- btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
@@ -1672,7 +1937,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
btrfs_release_path(path);
- path->leave_spinning = leave_spinning;
if (ret)
return ERR_PTR(ret);
@@ -1689,6 +1953,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags_ret)
{
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
int ret;
u64 flags;
u64 size = 0;
@@ -1704,11 +1969,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
key.objectid = logical;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -1728,7 +1993,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
BUG_ON(item_size < sizeof(*ei));
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
@@ -1903,7 +2168,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
@@ -1911,7 +2176,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
extent_item_objectid);
if (!search_commit_root) {
- trans = btrfs_attach_transaction(fs_info->extent_root);
+ trans = btrfs_attach_transaction(fs_info->tree_root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT &&
PTR_ERR(trans) != -EROFS)
@@ -1921,12 +2186,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
}
if (trans)
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_get_tree_mod_seq(fs_info, &seq_elem);
else
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- tree_mod_seq_elem.seq, &refs,
+ seq_elem.seq, &refs,
&extent_item_pos, ignore_offset);
if (ret)
goto out;
@@ -1934,7 +2199,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots,
+ seq_elem.seq, &roots,
ignore_offset);
if (ret)
break;
@@ -1957,7 +2222,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
if (trans) {
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_put_tree_mod_seq(fs_info, &seq_elem);
btrfs_end_transaction(trans);
} else {
up_read(&fs_info->commit_root_sem);
@@ -1966,10 +2231,29 @@ out:
return ret;
}
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+ void *ctx, bool ignore_offset)
{
int ret;
u64 extent_item_pos;
@@ -1987,17 +2271,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, search_commit_root,
- iterate, ctx, ignore_offset);
+ build_ino_list, ctx, ignore_offset);
return ret;
}
-typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx);
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
-static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
int ret = 0;
int slot;
@@ -2006,8 +2288,9 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u32 name_len;
u64 parent = 0;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2033,18 +2316,17 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item = btrfs_item_nr(slot);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
- for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+ for (cur = 0; cur < btrfs_item_size(eb, slot); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
fs_root->root_key.objectid);
- ret = iterate(parent, name_len,
- (unsigned long)(iref + 1), eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -2058,15 +2340,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
int ret;
int slot;
u64 offset = 0;
u64 parent;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
u32 item_size;
@@ -2092,7 +2374,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
}
btrfs_release_path(path);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr_offset(eb, slot);
cur_offset = 0;
@@ -2102,8 +2384,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
name_len = btrfs_inode_extref_name_len(eb, extref);
- ret = iterate(parent, name_len,
- (unsigned long)&extref->name, eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
if (ret)
break;
@@ -2120,34 +2402,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path, iterate_irefs_t *iterate,
- void *ctx)
-{
- int ret;
- int found_refs = 0;
-
- ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
- if (!ret)
- ++found_refs;
- else if (ret != -ENOENT)
- return ret;
-
- ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
- if (ret == -ENOENT && found_refs)
- return 0;
-
- return ret;
-}
-
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx)
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
- struct inode_fs_paths *ipath = ctx;
char *fspath;
char *fspath_min;
int i = ipath->fspath->elem_cnt;
@@ -2188,8 +2449,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
*/
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
- return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
@@ -2252,3 +2525,827 @@ void free_ipath(struct inode_fs_paths *ipath)
kvfree(ipath->fspath);
kfree(ipath);
}
+
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
+{
+ struct btrfs_backref_iter *ret;
+
+ ret = kzalloc(sizeof(*ret), gfp_flag);
+ if (!ret)
+ return NULL;
+
+ ret->path = btrfs_alloc_path();
+ if (!ret->path) {
+ kfree(ret);
+ return NULL;
+ }
+
+ /* Current backref iterator only supports iteration in commit root */
+ ret->path->search_commit_root = 1;
+ ret->path->skip_locking = 1;
+ ret->fs_info = fs_info;
+
+ return ret;
+}
+
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
+{
+ struct btrfs_fs_info *fs_info = iter->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
+ struct btrfs_path *path = iter->path;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ key.offset = (u64)-1;
+ iter->bytenr = bytenr;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ ret = -EUCLEAN;
+ goto release;
+ }
+ if (path->slots[0] == 0) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EUCLEAN;
+ goto release;
+ }
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if ((key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) {
+ ret = -ENOENT;
+ goto release;
+ }
+ memcpy(&iter->cur_key, &key, sizeof(key));
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->end_ptr = (u32)(iter->item_ptr +
+ btrfs_item_size(path->nodes[0], path->slots[0]));
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+
+ /*
+ * Only support iteration on tree backref yet.
+ *
+ * This is an extra precaution for non skinny-metadata, where
+ * EXTENT_ITEM is also used for tree blocks, that we can only use
+ * extent flags to determine if it's a tree block.
+ */
+ if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) {
+ ret = -ENOTSUPP;
+ goto release;
+ }
+ iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei));
+
+ /* If there is no inline backref, go search for keyed backref */
+ if (iter->cur_ptr >= iter->end_ptr) {
+ ret = btrfs_next_item(extent_root, path);
+
+ /* No inline nor keyed ref */
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto release;
+ }
+ if (ret < 0)
+ goto release;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key,
+ path->slots[0]);
+ if (iter->cur_key.objectid != bytenr ||
+ (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
+ iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) {
+ ret = -ENOENT;
+ goto release;
+ }
+ iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->item_ptr = iter->cur_ptr;
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size(
+ path->nodes[0], path->slots[0]));
+ }
+
+ return 0;
+release:
+ btrfs_backref_iter_release(iter);
+ return ret;
+}
+
+/*
+ * Go to the next backref item of current bytenr, can be either inlined or
+ * keyed.
+ *
+ * Caller needs to check whether it's inline ref or not by iter->cur_key.
+ *
+ * Return 0 if we get next backref without problem.
+ * Return >0 if there is no extra backref for this bytenr.
+ * Return <0 if there is something wrong happened.
+ */
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
+{
+ struct extent_buffer *eb = btrfs_backref_get_eb(iter);
+ struct btrfs_root *extent_root;
+ struct btrfs_path *path = iter->path;
+ struct btrfs_extent_inline_ref *iref;
+ int ret;
+ u32 size;
+
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
+ /* We're still inside the inline refs */
+ ASSERT(iter->cur_ptr < iter->end_ptr);
+
+ if (btrfs_backref_has_tree_block_info(iter)) {
+ /* First tree block info */
+ size = sizeof(struct btrfs_tree_block_info);
+ } else {
+ /* Use inline ref type to determine the size */
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)
+ ((unsigned long)iter->cur_ptr);
+ type = btrfs_extent_inline_ref_type(eb, iref);
+
+ size = btrfs_extent_inline_ref_size(type);
+ }
+ iter->cur_ptr += size;
+ if (iter->cur_ptr < iter->end_ptr)
+ return 0;
+
+ /* All inline items iterated, fall through */
+ }
+
+ /* We're at keyed items, there is no inline item, go to the next one */
+ extent_root = btrfs_extent_root(iter->fs_info, iter->bytenr);
+ ret = btrfs_next_item(extent_root, iter->path);
+ if (ret)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]);
+ if (iter->cur_key.objectid != iter->bytenr ||
+ (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+ iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY))
+ return 1;
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]);
+ iter->cur_ptr = iter->item_ptr;
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size(path->nodes[0],
+ path->slots[0]);
+ return 0;
+}
+
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_backref_cache *cache, int is_reloc)
+{
+ int i;
+
+ cache->rb_root = RB_ROOT;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&cache->pending[i]);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+ INIT_LIST_HEAD(&cache->pending_edge);
+ INIT_LIST_HEAD(&cache->useless_node);
+ cache->fs_info = fs_info;
+ cache->is_reloc = is_reloc;
+}
+
+struct btrfs_backref_node *btrfs_backref_alloc_node(
+ struct btrfs_backref_cache *cache, u64 bytenr, int level)
+{
+ struct btrfs_backref_node *node;
+
+ ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (!node)
+ return node;
+
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ node->level = level;
+ node->bytenr = bytenr;
+
+ return node;
+}
+
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
+ struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
+}
+
+/*
+ * Drop the backref node from cache, also cleaning up all its
+ * upper edges and any uncached nodes in the path.
+ *
+ * This cleanup happens bottom up, thus the node should either
+ * be the lowest node in the cache or a detached node.
+ */
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+
+ if (!node)
+ return;
+
+ BUG_ON(!node->lowest && !node->detached);
+ while (!list_empty(&node->upper)) {
+ edge = list_entry(node->upper.next, struct btrfs_backref_edge,
+ list[LOWER]);
+ upper = edge->node[UPPER];
+ list_del(&edge->list[LOWER]);
+ list_del(&edge->list[UPPER]);
+ btrfs_backref_free_edge(cache, edge);
+
+ /*
+ * Add the node to leaf node list if no other child block
+ * cached.
+ */
+ if (list_empty(&upper->lower)) {
+ list_add_tail(&upper->lower, &cache->leaves);
+ upper->lowest = 1;
+ }
+ }
+
+ btrfs_backref_drop_node(cache, node);
+}
+
+/*
+ * Release all nodes/edges from current cache
+ */
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
+{
+ struct btrfs_backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct btrfs_backref_node, list);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct btrfs_backref_node, lower);
+ btrfs_backref_cleanup_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ ASSERT(list_empty(&cache->pending[i]));
+ ASSERT(list_empty(&cache->pending_edge));
+ ASSERT(list_empty(&cache->useless_node));
+ ASSERT(list_empty(&cache->changed));
+ ASSERT(list_empty(&cache->detached));
+ ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
+ ASSERT(!cache->nr_nodes);
+ ASSERT(!cache->nr_edges);
+}
+
+/*
+ * Handle direct tree backref
+ *
+ * Direct tree backref means, the backref item shows its parent bytenr
+ * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
+ *
+ * @ref_key: The converted backref key.
+ * For keyed backref, it's the item key.
+ * For inlined backref, objectid is the bytenr,
+ * type is btrfs_inline_ref_type, offset is
+ * btrfs_inline_ref_offset.
+ */
+static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
+ struct btrfs_key *ref_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *upper;
+ struct rb_node *rb_node;
+
+ ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
+
+ /* Only reloc root uses backref pointing to itself */
+ if (ref_key->objectid == ref_key->offset) {
+ struct btrfs_root *root;
+
+ cur->is_reloc_root = 1;
+ /* Only reloc backref cache cares about a specific root */
+ if (cache->is_reloc) {
+ root = find_reloc_root(cache->fs_info, cur->bytenr);
+ if (!root)
+ return -ENOENT;
+ cur->root = root;
+ } else {
+ /*
+ * For generic purpose backref cache, reloc root node
+ * is useless.
+ */
+ list_add(&cur->list, &cache->useless_node);
+ }
+ return 0;
+ }
+
+ edge = btrfs_backref_alloc_edge(cache);
+ if (!edge)
+ return -ENOMEM;
+
+ rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
+ if (!rb_node) {
+ /* Parent node not yet cached */
+ upper = btrfs_backref_alloc_node(cache, ref_key->offset,
+ cur->level + 1);
+ if (!upper) {
+ btrfs_backref_free_edge(cache, edge);
+ return -ENOMEM;
+ }
+
+ /*
+ * Backrefs for the upper level block isn't cached, add the
+ * block to pending list
+ */
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
+ } else {
+ /* Parent node already cached */
+ upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ return 0;
+}
+
+/*
+ * Handle indirect tree backref
+ *
+ * Indirect tree backref means, we only know which tree the node belongs to.
+ * We still need to do a tree search to find out the parents. This is for
+ * TREE_BLOCK_REF backref (keyed or inlined).
+ *
+ * @ref_key: The same as @ref_key in handle_direct_tree_backref()
+ * @tree_key: The first key of this tree block.
+ * @path: A clean (released) path, to avoid allocating path every time
+ * the function get called.
+ */
+static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_key *ref_key,
+ struct btrfs_key *tree_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_node *lower;
+ struct btrfs_backref_edge *edge;
+ struct extent_buffer *eb;
+ struct btrfs_root *root;
+ struct rb_node *rb_node;
+ int level;
+ bool need_check = true;
+ int ret;
+
+ root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ cur->cowonly = 1;
+
+ if (btrfs_root_level(&root->root_item) == cur->level) {
+ /* Tree root */
+ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
+ /*
+ * For reloc backref cache, we may ignore reloc root. But for
+ * general purpose backref cache, we can't rely on
+ * btrfs_should_ignore_reloc_root() as it may conflict with
+ * current running relocation and lead to missing root.
+ *
+ * For general purpose backref cache, reloc root detection is
+ * completely relying on direct backref (key->offset is parent
+ * bytenr), thus only do such check for reloc cache.
+ */
+ if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) {
+ btrfs_put_root(root);
+ list_add(&cur->list, &cache->useless_node);
+ } else {
+ cur->root = root;
+ }
+ return 0;
+ }
+
+ level = cur->level + 1;
+
+ /* Search the tree to find parent blocks referring to the block */
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ btrfs_put_root(root);
+ return ret;
+ }
+ if (ret > 0 && path->slots[level] > 0)
+ path->slots[level]--;
+
+ eb = path->nodes[level];
+ if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
+ btrfs_err(fs_info,
+"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
+ cur->bytenr, level - 1, root->root_key.objectid,
+ tree_key->objectid, tree_key->type, tree_key->offset);
+ btrfs_put_root(root);
+ ret = -ENOENT;
+ goto out;
+ }
+ lower = cur;
+
+ /* Add all nodes and edges in the path */
+ for (; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level]) {
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
+ lower->bytenr);
+ /* Same as previous should_ignore_reloc_root() call */
+ if (btrfs_should_ignore_reloc_root(root) &&
+ cache->is_reloc) {
+ btrfs_put_root(root);
+ list_add(&lower->list, &cache->useless_node);
+ } else {
+ lower->root = root;
+ }
+ break;
+ }
+
+ edge = btrfs_backref_alloc_edge(cache);
+ if (!edge) {
+ btrfs_put_root(root);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ eb = path->nodes[level];
+ rb_node = rb_simple_search(&cache->rb_root, eb->start);
+ if (!rb_node) {
+ upper = btrfs_backref_alloc_node(cache, eb->start,
+ lower->level + 1);
+ if (!upper) {
+ btrfs_put_root(root);
+ btrfs_backref_free_edge(cache, edge);
+ ret = -ENOMEM;
+ goto out;
+ }
+ upper->owner = btrfs_header_owner(eb);
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ upper->cowonly = 1;
+
+ /*
+ * If we know the block isn't shared we can avoid
+ * checking its backrefs.
+ */
+ if (btrfs_block_can_be_shared(root, eb))
+ upper->checked = 0;
+ else
+ upper->checked = 1;
+
+ /*
+ * Add the block to pending list if we need to check its
+ * backrefs, we only do this once while walking up a
+ * tree as we will catch anything else later on.
+ */
+ if (!upper->checked && need_check) {
+ need_check = false;
+ list_add_tail(&edge->list[UPPER],
+ &cache->pending_edge);
+ } else {
+ if (upper->checked)
+ need_check = true;
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
+ } else {
+ upper = rb_entry(rb_node, struct btrfs_backref_node,
+ rb_node);
+ ASSERT(upper->checked);
+ INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
+ }
+ btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+
+ if (rb_node) {
+ btrfs_put_root(root);
+ break;
+ }
+ lower = upper;
+ upper = NULL;
+ }
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+/*
+ * Add backref node @cur into @cache.
+ *
+ * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
+ * links aren't yet bi-directional. Needs to finish such links.
+ * Use btrfs_backref_finish_upper_links() to finish such linkage.
+ *
+ * @path: Released path for indirect tree backref lookup
+ * @iter: Released backref iter for extent tree search
+ * @node_key: The first key of the tree block
+ */
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_backref_iter *iter,
+ struct btrfs_key *node_key,
+ struct btrfs_backref_node *cur)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *exist;
+ int ret;
+
+ ret = btrfs_backref_iter_start(iter, cur->bytenr);
+ if (ret < 0)
+ return ret;
+ /*
+ * We skip the first btrfs_tree_block_info, as we don't use the key
+ * stored in it, but fetch it from the tree block
+ */
+ if (btrfs_backref_has_tree_block_info(iter)) {
+ ret = btrfs_backref_iter_next(iter);
+ if (ret < 0)
+ goto out;
+ /* No extra backref? This means the tree block is corrupted */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ WARN_ON(cur->checked);
+ if (!list_empty(&cur->upper)) {
+ /*
+ * The backref was added previously when processing backref of
+ * type BTRFS_TREE_BLOCK_REF_KEY
+ */
+ ASSERT(list_is_singular(&cur->upper));
+ edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
+ list[LOWER]);
+ ASSERT(list_empty(&edge->list[UPPER]));
+ exist = edge->node[UPPER];
+ /*
+ * Add the upper level block to pending list if we need check
+ * its backrefs
+ */
+ if (!exist->checked)
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
+ } else {
+ exist = NULL;
+ }
+
+ for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
+ struct extent_buffer *eb;
+ struct btrfs_key key;
+ int type;
+
+ cond_resched();
+ eb = btrfs_backref_get_eb(iter);
+
+ key.objectid = iter->bytenr;
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
+ struct btrfs_extent_inline_ref *iref;
+
+ /* Update key for inline backref */
+ iref = (struct btrfs_extent_inline_ref *)
+ ((unsigned long)iter->cur_ptr);
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ key.type = type;
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+ } else {
+ key.type = iter->cur_key.type;
+ key.offset = iter->cur_key.offset;
+ }
+
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref
+ */
+ if (exist &&
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+ exist->owner == key.offset) ||
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+ exist->bytenr == key.offset))) {
+ exist = NULL;
+ continue;
+ }
+
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+ ret = handle_direct_tree_backref(cache, &key, cur);
+ if (ret < 0)
+ goto out;
+ continue;
+ } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
+ ret = -EINVAL;
+ btrfs_print_v0_err(fs_info);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
+ goto out;
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+ continue;
+ }
+
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
+ ret = handle_indirect_tree_backref(cache, path, &key, node_key,
+ cur);
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+ cur->checked = 1;
+ WARN_ON(exist);
+out:
+ btrfs_backref_iter_release(iter);
+ return ret;
+}
+
+/*
+ * Finish the upwards linkage created by btrfs_backref_add_tree_node()
+ */
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *start)
+{
+ struct list_head *useless_node = &cache->useless_node;
+ struct btrfs_backref_edge *edge;
+ struct rb_node *rb_node;
+ LIST_HEAD(pending_edge);
+
+ ASSERT(start->checked);
+
+ /* Insert this node to cache if it's not COW-only */
+ if (!start->cowonly) {
+ rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
+ &start->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(cache->fs_info, start->bytenr,
+ -EEXIST);
+ list_add_tail(&start->lower, &cache->leaves);
+ }
+
+ /*
+ * Use breadth first search to iterate all related edges.
+ *
+ * The starting points are all the edges of this node
+ */
+ list_for_each_entry(edge, &start->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &pending_edge);
+
+ while (!list_empty(&pending_edge)) {
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_node *lower;
+
+ edge = list_first_entry(&pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del_init(&edge->list[UPPER]);
+ upper = edge->node[UPPER];
+ lower = edge->node[LOWER];
+
+ /* Parent is detached, no need to keep any edges */
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ btrfs_backref_free_edge(cache, edge);
+
+ /* Lower node is orphan, queue for cleanup */
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, useless_node);
+ continue;
+ }
+
+ /*
+ * All new nodes added in current build_backref_tree() haven't
+ * been linked to the cache rb tree.
+ * So if we have upper->rb_node populated, this means a cache
+ * hit. We only need to link the edge, as @upper and all its
+ * parents have already been linked.
+ */
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+ continue;
+ }
+
+ /* Sanity check, we shouldn't have any unchecked nodes */
+ if (!upper->checked) {
+ ASSERT(0);
+ return -EUCLEAN;
+ }
+
+ /* Sanity check, COW-only node has non-COW-only parent */
+ if (start->cowonly != upper->cowonly) {
+ ASSERT(0);
+ return -EUCLEAN;
+ }
+
+ /* Only cache non-COW-only (subvolume trees) tree blocks */
+ if (!upper->cowonly) {
+ rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ if (rb_node) {
+ btrfs_backref_panic(cache->fs_info,
+ upper->bytenr, -EEXIST);
+ return -EUCLEAN;
+ }
+ }
+
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+
+ /*
+ * Also queue all the parent edges of this uncached node
+ * to finish the upper linkage
+ */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &pending_edge);
+ }
+ return 0;
+}
+
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ struct btrfs_backref_node *lower;
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+
+ while (!list_empty(&cache->useless_node)) {
+ lower = list_first_entry(&cache->useless_node,
+ struct btrfs_backref_node, list);
+ list_del_init(&lower->list);
+ }
+ while (!list_empty(&cache->pending_edge)) {
+ edge = list_first_entry(&cache->pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ upper = edge->node[UPPER];
+ btrfs_backref_free_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes and
+ * isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &cache->useless_node);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+ /* Add this guy's upper edges to the list to process */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER],
+ &cache->pending_edge);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &cache->useless_node);
+ }
+
+ while (!list_empty(&cache->useless_node)) {
+ lower = list_first_entry(&cache->useless_node,
+ struct btrfs_backref_node, list);
+ list_del_init(&lower->list);
+ if (lower == node)
+ node = NULL;
+ btrfs_backref_drop_node(cache, lower);
+ }
+
+ btrfs_backref_cleanup_node(cache, node);
+ ASSERT(list_empty(&cache->useless_node) &&
+ list_empty(&cache->pending_edge));
+}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 777f61dc081e..8e69584d538d 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -8,6 +8,7 @@
#include <linux/btrfs.h>
#include "ulist.h"
+#include "disk-io.h"
#include "extent_io.h"
struct inode_fs_paths {
@@ -16,6 +17,21 @@ struct inode_fs_paths {
struct btrfs_data_container *fspath;
};
+struct btrfs_backref_shared_cache_entry {
+ u64 bytenr;
+ u64 gen;
+ bool is_shared;
+};
+
+struct btrfs_backref_shared_cache {
+ /*
+ * A path from a root to a leaf that has a file extent item pointing to
+ * a given data extent should never exceed the maximum b+tree height.
+ */
+ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+ bool use_cache;
+};
+
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
@@ -34,15 +50,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
+ struct btrfs_path *path, void *ctx,
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots, bool ignore_offset);
+ u64 time_seq, struct ulist **roots,
+ bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
@@ -57,8 +77,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp_ulist);
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
@@ -74,4 +96,303 @@ struct prelim_ref {
u64 wanted_disk_byte;
};
+/*
+ * Iterate backrefs of one extent.
+ *
+ * Now it only supports iteration of tree block in commit root.
+ */
+struct btrfs_backref_iter {
+ u64 bytenr;
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_key cur_key;
+ u32 item_ptr;
+ u32 cur_ptr;
+ u32 end_ptr;
+};
+
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
+
+static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
+{
+ if (!iter)
+ return;
+ btrfs_free_path(iter->path);
+ kfree(iter);
+}
+
+static inline struct extent_buffer *btrfs_backref_get_eb(
+ struct btrfs_backref_iter *iter)
+{
+ if (!iter)
+ return NULL;
+ return iter->path->nodes[0];
+}
+
+/*
+ * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
+ * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
+ *
+ * This helper determines if that's the case.
+ */
+static inline bool btrfs_backref_has_tree_block_info(
+ struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY &&
+ iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item))
+ return true;
+ return false;
+}
+
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
+
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
+
+static inline bool btrfs_backref_iter_is_inline_ref(
+ struct btrfs_backref_iter *iter)
+{
+ if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
+ return true;
+ return false;
+}
+
+static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
+{
+ iter->bytenr = 0;
+ iter->item_ptr = 0;
+ iter->cur_ptr = 0;
+ iter->end_ptr = 0;
+ btrfs_release_path(iter->path);
+ memset(&iter->cur_key, 0, sizeof(iter->cur_key));
+}
+
+/*
+ * Backref cache related structures
+ *
+ * The whole objective of backref_cache is to build a bi-directional map
+ * of tree blocks (represented by backref_node) and all their parents.
+ */
+
+/*
+ * Represent a tree block in the backref cache
+ */
+struct btrfs_backref_node {
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simple_node for search/insert */
+
+ u64 new_bytenr;
+ /* Objectid of tree block owner, can be not uptodate */
+ u64 owner;
+ /* Link to pending, changed or detached list */
+ struct list_head list;
+
+ /* List of upper level edges, which link this node to its parents */
+ struct list_head upper;
+ /* List of lower level edges, which link this node to its children */
+ struct list_head lower;
+
+ /* NULL if this node is not tree root */
+ struct btrfs_root *root;
+ /* Extent buffer got by COWing the block */
+ struct extent_buffer *eb;
+ /* Level of the tree block */
+ unsigned int level:8;
+ /* Is the block in a non-shareable tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node is in the cache */
+ unsigned int lowest:1;
+ /* Is the extent buffer locked */
+ unsigned int locked:1;
+ /* Has the block been processed */
+ unsigned int processed:1;
+ /* Have backrefs of this block been checked */
+ unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been COWed but some upper level block
+ * pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /* 1 if the backref node isn't connected to any other backref node */
+ unsigned int detached:1;
+
+ /*
+ * For generic purpose backref cache, where we only care if it's a reloc
+ * root, doesn't care the source subvolid.
+ */
+ unsigned int is_reloc_root:1;
+};
+
+#define LOWER 0
+#define UPPER 1
+
+/*
+ * Represent an edge connecting upper and lower backref nodes.
+ */
+struct btrfs_backref_edge {
+ /*
+ * list[LOWER] is linked to btrfs_backref_node::upper of lower level
+ * node, and list[UPPER] is linked to btrfs_backref_node::lower of
+ * upper level node.
+ *
+ * Also, build_backref_tree() uses list[UPPER] for pending edges, before
+ * linking list[UPPER] to its upper level nodes.
+ */
+ struct list_head list[2];
+
+ /* Two related nodes */
+ struct btrfs_backref_node *node[2];
+};
+
+struct btrfs_backref_cache {
+ /* Red black tree of all backref nodes in the cache */
+ struct rb_root rb_root;
+ /* For passing backref nodes to btrfs_reloc_cow_block */
+ struct btrfs_backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * List of blocks that have been COWed but some block pointers in upper
+ * level blocks may not reflect the new location
+ */
+ struct list_head pending[BTRFS_MAX_LEVEL];
+ /* List of backref nodes with no child node */
+ struct list_head leaves;
+ /* List of blocks that have been COWed in current transaction */
+ struct list_head changed;
+ /* List of detached backref node. */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
+
+ /* List of unchecked backref edges during backref cache build */
+ struct list_head pending_edge;
+
+ /* List of useless backref nodes during backref cache build */
+ struct list_head useless_node;
+
+ struct btrfs_fs_info *fs_info;
+
+ /*
+ * Whether this cache is for relocation
+ *
+ * Reloction backref cache require more info for reloc root compared
+ * to generic backref cache.
+ */
+ unsigned int is_reloc;
+};
+
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_backref_cache *cache, int is_reloc);
+struct btrfs_backref_node *btrfs_backref_alloc_node(
+ struct btrfs_backref_cache *cache, u64 bytenr, int level);
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
+ struct btrfs_backref_cache *cache);
+
+#define LINK_LOWER (1 << 0)
+#define LINK_UPPER (1 << 1)
+static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper,
+ int link_which)
+{
+ ASSERT(upper && lower && upper->level == lower->level + 1);
+ edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
+ if (link_which & LINK_LOWER)
+ list_add_tail(&edge->list[LOWER], &lower->upper);
+ if (link_which & LINK_UPPER)
+ list_add_tail(&edge->list[UPPER], &upper->lower);
+}
+
+static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node)
+{
+ if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
+ cache->nr_nodes--;
+ btrfs_put_root(node->root);
+ kfree(node);
+ }
+}
+
+static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_edge *edge)
+{
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
+}
+
+static inline void btrfs_backref_unlock_node_buffer(
+ struct btrfs_backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
+static inline void btrfs_backref_drop_node_buffer(
+ struct btrfs_backref_node *node)
+{
+ if (node->eb) {
+ btrfs_backref_unlock_node_buffer(node);
+ free_extent_buffer(node->eb);
+ node->eb = NULL;
+ }
+}
+
+/*
+ * Drop the backref node from cache without cleaning up its children
+ * edges.
+ *
+ * This can only be called on node without parent edges.
+ * The children edges are still kept as is.
+ */
+static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
+ struct btrfs_backref_node *node)
+{
+ ASSERT(list_empty(&node->upper));
+
+ btrfs_backref_drop_node_buffer(node);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ btrfs_backref_free_node(tree, node);
+}
+
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
+
+static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
+ u64 bytenr, int errno)
+{
+ btrfs_panic(fs_info, errno,
+ "Inconsistency in backref cache found at offset %llu",
+ bytenr);
+}
+
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+ struct btrfs_path *path,
+ struct btrfs_backref_iter *iter,
+ struct btrfs_key *node_key,
+ struct btrfs_backref_node *cur);
+
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *start);
+
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node);
+
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 7f09147872dc..deebc8ddbd93 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
@@ -7,7 +8,6 @@
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "disk-io.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
@@ -16,6 +16,7 @@
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
+#include "zoned.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -66,11 +67,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
- /* Pick target profile only if it's already available */
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
- spin_unlock(&fs_info->balance_lock);
- return extended_to_chunk(target);
- }
+ spin_unlock(&fs_info->balance_lock);
+ return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
@@ -119,14 +117,23 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
- atomic_inc(&cache->count);
+ refcount_inc(&cache->refs);
}
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
- if (atomic_dec_and_test(&cache->count)) {
+ if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
- WARN_ON(cache->reserved > 0);
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on reserved > 0 in that
+ * case.
+ */
+ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+ WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
@@ -147,6 +154,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
kfree(cache);
}
}
@@ -160,9 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
+ bool leftmost = true;
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ ASSERT(block_group->length != 0);
+
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
while (*p) {
parent = *p;
@@ -171,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
p = &(*p)->rb_left;
} else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
-
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return 0;
}
@@ -200,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -223,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
- if (ret) {
+ if (ret)
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
return ret;
}
@@ -257,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -274,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increments the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
+ * is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we can not do a NOCOW write
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
- return false;
+ return NULL;
spin_lock(&bg->lock);
if (bg->ro)
- ret = false;
+ can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
}
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. It it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
@@ -410,18 +440,22 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
@@ -460,7 +494,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
int ret;
while (start < end) {
- ret = find_first_extent_bit(info->pinned_extents, start,
+ ret = find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY | EXTENT_UPTODATE,
NULL);
@@ -496,7 +530,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -511,6 +545,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
+ extent_root = btrfs_extent_root(fs_info, last);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -558,8 +593,6 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -583,9 +616,6 @@ next:
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -620,7 +650,6 @@ next:
total_found += add_new_free_space(block_group, last,
block_group->start + block_group->length);
- caching_ctl->progress = (u64)-1;
out:
btrfs_free_path(path);
@@ -641,11 +670,36 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ ret = load_free_space_cache(block_group);
+ if (ret == 1) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * We failed to load the space cache, set ourselves to
+ * CACHE_STARTED and carry on.
+ */
+ spin_lock(&block_group->lock);
+ block_group->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&block_group->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
-
+done:
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
@@ -665,8 +719,6 @@ static noinline void caching_thread(struct btrfs_work *work)
}
#endif
- caching_ctl->progress = (u64)-1;
-
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
@@ -677,13 +729,16 @@ static noinline void caching_thread(struct btrfs_work *work)
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_caching_control *caching_ctl;
+ struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
+ /* Allocator for zoned filesystems does not use the cache at all */
+ if (btrfs_is_zoned(fs_info))
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -692,120 +747,37 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->start;
- refcount_set(&caching_ctl->count, 1);
+ refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
- /*
- * This should be a rare occasion, but this could happen I think in the
- * case where one thread starts to load the space cache info, and then
- * some other thread starts a transaction commit which tries to do an
- * allocation while the other thread is still loading the space cache
- * info. The previous loop should have kept us from choosing this block
- * group, but if we've moved to the state where we will wait on caching
- * block groups we need to first check if we're doing a fast load here,
- * so we can wait for it to finish, otherwise we could end up allocating
- * from a block group who's cache gets evicted for one reason or
- * another.
- */
- while (cache->cached == BTRFS_CACHE_FAST) {
- struct btrfs_caching_control *ctl;
-
- ctl = cache->caching_ctl;
- refcount_inc(&ctl->count);
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&cache->lock);
-
- schedule();
-
- finish_wait(&ctl->wait, &wait);
- btrfs_put_caching_control(ctl);
- spin_lock(&cache->lock);
- }
-
if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
kfree(caching_ctl);
- return 0;
+
+ caching_ctl = cache->caching_ctl;
+ if (caching_ctl)
+ refcount_inc(&caching_ctl->count);
+ spin_unlock(&cache->lock);
+ goto out;
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_FAST;
+ cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
- mutex_lock(&caching_ctl->mutex);
- ret = load_free_space_cache(cache);
-
- spin_lock(&cache->lock);
- if (ret == 1) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_FINISHED;
- cache->last_byte_to_unpin = (u64)-1;
- caching_ctl->progress = (u64)-1;
- } else {
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- }
- spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
- if (ret == 1 &&
- btrfs_should_fragment_free_space(cache)) {
- u64 bytes_used;
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- bytes_used = cache->length - cache->used;
- cache->space_info->bytes_used += bytes_used >> 1;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fragment_free_space(cache);
- }
-#endif
- mutex_unlock(&caching_ctl->mutex);
-
- wake_up(&caching_ctl->wait);
- if (ret == 1) {
- btrfs_put_caching_control(caching_ctl);
- btrfs_free_excluded_extents(cache);
- return 0;
- }
- } else {
- /*
- * We're either using the free space tree or no caching at all.
- * Set cached to the appropriate value and wakeup any waiters.
- */
- spin_lock(&cache->lock);
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- spin_unlock(&cache->lock);
- wake_up(&caching_ctl->wait);
- }
-
- if (load_cache_only) {
- btrfs_put_caching_control(caching_ctl);
- return 0;
- }
-
- down_write(&fs_info->commit_root_sem);
+ write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->commit_root_sem);
+ write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+out:
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
+ if (caching_ctl)
+ btrfs_put_caching_control(caching_ctl);
return ret;
}
@@ -863,16 +835,37 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
+static int remove_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int ret;
+
+ root = btrfs_block_group_root(fs_info);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_del_item(trans, root, path);
+ return ret;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
struct btrfs_path *path;
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_key key;
struct inode *inode;
struct kobject *kobj = NULL;
int ret;
@@ -913,6 +906,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -950,51 +946,19 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&trans->transaction->dirty_bgs_lock);
mutex_unlock(&trans->transaction->cache_write_mutex);
- if (!IS_ERR(inode)) {
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
- btrfs_add_delayed_iput(inode);
- goto out;
- }
- clear_nlink(inode);
- /* One for the block groups ref */
- spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- iput(inode);
- } else {
- spin_unlock(&block_group->lock);
- }
- /* One for our lookup ref */
- btrfs_add_delayed_iput(inode);
- }
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.type = 0;
- key.offset = block_group->start;
-
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ ret = btrfs_remove_free_space_inode(trans, inode, block_group);
+ if (ret)
goto out;
- if (ret > 0)
- btrfs_release_path(path);
- if (ret == 0) {
- ret = btrfs_del_item(trans, tree_root, path);
- if (ret)
- goto out;
- btrfs_release_path(path);
- }
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
+ write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -1014,32 +978,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- down_write(&fs_info->commit_root_sem);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- up_write(&fs_info->commit_root_sem);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
}
}
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1055,36 +1018,65 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
WARN_ON(block_group->space_info->total_bytes
< block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length);
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
+ WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags) &&
+ block_group->space_info->active_total_bytes
+ < block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -= block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ block_group->space_info->active_total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ block_group->space_info->bytes_zone_unusable -=
+ block_group->zone_unusable;
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
- key.objectid = block_group->start;
- key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- key.offset = block_group->length;
+ /*
+ * Remove the free space for the block group from the free space tree
+ * and the block group's item from the extent tree before marking the
+ * block group as removed. This is to prevent races with tasks that
+ * freeze and unfreeze a block group, this task and another task
+ * allocating a new block group - the unfreeze task ends up removing
+ * the block group's extent map before the task calling this function
+ * deletes the block group item from the extent tree, allowing for
+ * another task to attempt to create another block group with the same
+ * item key (and failing with -EEXIST and a transaction abort).
+ */
+ ret = remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_item(trans, path, block_group);
+ if (ret < 0)
+ goto out;
- mutex_lock(&fs_info->chunk_mutex);
spin_lock(&block_group->lock);
- block_group->removed = 1;
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
/*
- * At this point trimming can't start on this block group, because we
- * removed the block group from the tree fs_info->block_group_cache_tree
- * so no one can't find it anymore and even if someone already got this
- * block group before we removed it from the rbtree, they have already
- * incremented block_group->trimming - if they didn't, they won't find
- * any free space entries because we already removed them all when we
- * called btrfs_remove_free_space_cache().
+ * At this point trimming or scrub can't start on this block group,
+ * because we removed the block group from the rbtree
+ * fs_info->block_group_cache_tree so no one can't find it anymore and
+ * even if someone already got this block group before we removed it
+ * from the rbtree, they have already incremented block_group->frozen -
+ * if they didn't, for the trimming case they won't find any free space
+ * entries because we already removed them all when we called
+ * btrfs_remove_free_space_cache().
*
* And we must not remove the extent map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
- * ranges from being reused for a new block group. This is because our
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+ * ranges from being reused for a new block group. This is needed to
+ * avoid races with trimming and scrub.
+ *
+ * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
* completely transactionless, so while it is trimming a range the
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
@@ -1095,28 +1087,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->trimming) == 0);
+ remove_em = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- mutex_unlock(&fs_info->chunk_mutex);
-
- ret = remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out;
-
- btrfs_put_block_group(block_group);
- btrfs_put_block_group(block_group);
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret > 0)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- ret = btrfs_del_item(trans, root, path);
- if (ret)
- goto out;
-
if (remove_em) {
struct extent_map_tree *em_tree;
@@ -1127,7 +1100,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
/* once for the tree */
free_extent_map(em);
}
+
out:
+ /* Once for the lookup reference */
+ btrfs_put_block_group(block_group);
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
@@ -1137,6 +1113,7 @@ out:
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -1170,8 +1147,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
num_items = 3 + map->num_stripes;
free_extent_map(em);
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items, 1);
+ return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
/*
@@ -1196,6 +1172,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+ if (cache->swap_extents) {
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (cache->ro) {
cache->ro++;
ret = 0;
@@ -1203,7 +1184,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
}
num_bytes = cache->length - cache->reserved - cache->pinned -
- cache->bytes_super - cache->used;
+ cache->bytes_super - cache->zone_unusable - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1234,6 +1215,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (!ret) {
sinfo->bytes_readonly += num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes to readonly */
+ sinfo->bytes_readonly += cache->zone_unusable;
+ sinfo->bytes_zone_unusable -= cache->zone_unusable;
+ cache->zone_unusable = 0;
+ }
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
@@ -1248,6 +1235,51 @@ out:
return ret;
}
+static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_transaction *prev_trans = NULL;
+ const u64 start = bg->start;
+ const u64 end = start + bg->length - 1;
+ int ret;
+
+ spin_lock(&fs_info->trans_lock);
+ if (trans->transaction->list.prev != &fs_info->trans_list) {
+ prev_trans = list_last_entry(&trans->transaction->list,
+ struct btrfs_transaction, list);
+ refcount_inc(&prev_trans->use_count);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N, another
+ * task might be running finish_extent_commit() for the previous
+ * transaction N - 1, and have seen a range belonging to the block
+ * group in pinned_extents before we were able to clear the whole block
+ * group range from pinned_extents. This means that task can lookup for
+ * the block group after we unpinned it from pinned_extents and removed
+ * it, leading to a BUG_ON() at unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans) {
+ ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY);
+ if (ret)
+ goto out;
+ }
+
+ ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY);
+out:
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ if (prev_trans)
+ btrfs_put_transaction(prev_trans);
+
+ return ret == 0;
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -1263,9 +1295,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip deletion if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
+ return;
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
- u64 start, end;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1283,8 +1324,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -1328,6 +1367,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@@ -1344,35 +1391,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
- start = block_group->start;
- end = start + block_group->length - 1;
- /*
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
- * btrfs_finish_extent_commit(). If we are at transaction N,
- * another task might be running finish_extent_commit() for the
- * previous transaction N - 1, and have seen a range belonging
- * to the block group in freed_extents[] before we were able to
- * clear the whole block group range from freed_extents[]. This
- * means that task can lookup for the block group after we
- * unpinned it from freed_extents[] and removed it, leading to
- * a BUG_ON() at btrfs_unpin_extent_range().
- */
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ if (!clean_pinned_extents(trans, block_group)) {
btrfs_dec_block_group_ro(block_group);
goto end_trans;
}
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/*
* At this point, the block_group is read only and should fail
@@ -1398,9 +1420,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1416,12 +1435,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
+ /*
+ * DISCARD can flip during remount. On zoned filesystems, we
+ * need to reset sequential-required zones.
+ */
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */
if (trimming)
- btrfs_get_block_group_trimming(block_group);
+ btrfs_freeze_block_group(block_group);
/*
* Btrfs_remove_chunk will abort the transaction if things go
@@ -1431,7 +1454,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (ret) {
if (trimming)
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
goto end_trans;
}
@@ -1455,16 +1478,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans);
next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
@@ -1482,83 +1505,223 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- struct btrfs_key *key)
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
{
- struct btrfs_root *root = fs_info->extent_root;
- int ret = 0;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info =
+ container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
+ sb_start_write(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ sb_end_write(fs_info->sb);
+ return;
+ }
+
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip reclaim if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+ btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
+ return;
+ }
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
+ while (!list_empty(&fs_info->reclaim_bgs)) {
+ u64 zone_unusable;
+ int ret = 0;
+
+ bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&bg->bg_list);
+
+ space_info = bg->space_info;
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&bg->lock);
+
+ /* Get out fast, in case we're unmounting the filesystem */
+ if (btrfs_fs_closing(fs_info)) {
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the blog group is read only it's
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore.
+ */
+ zone_unusable = bg->zone_unusable;
+ ret = inc_block_group_ro(bg, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ goto next;
+
+ btrfs_info(fs_info,
+ "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+ bg->start, div_u64(bg->used * 100, bg->length),
+ div64_u64(zone_unusable * 100, bg->length));
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start);
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+ }
+
+next:
+ btrfs_put_block_group(bg);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&fs_info->reclaim_bgs))
+ queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_reclaim_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
struct btrfs_block_group_item bg;
- u64 flags;
+ struct extent_buffer *leaf;
int slot;
+ u64 flags;
+ int ret = 0;
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- goto out;
+ slot = path->slots[0];
+ leaf = path->nodes[0];
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ em_tree = &fs_info->mapping_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ key->objectid, key->offset);
+ return -ENOENT;
+ }
+
+ if (em->start != key->objectid || em->len != key->offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ key->objectid, key->offset, em->start, em->len);
+ ret = -EUCLEAN;
+ goto out_free_em;
+ }
+ read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_stack_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ key->objectid, key->offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ ret = -EUCLEAN;
+ }
+
+out_free_em:
+ free_extent_map(em);
+ return ret;
+}
+
+static int find_first_block_group(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ int ret;
+ struct btrfs_key found_key;
+
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- em_tree = &root->fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, found_key.objectid,
- found_key.offset);
- read_unlock(&em_tree->lock);
- if (!em) {
- btrfs_err(fs_info,
- "logical %llu len %llu found bg but no related chunk",
- found_key.objectid, found_key.offset);
- ret = -ENOENT;
- } else if (em->start != found_key.objectid ||
- em->len != found_key.offset) {
- btrfs_err(fs_info,
- "block group %llu len %llu mismatch with chunk %llu len %llu",
- found_key.objectid, found_key.offset,
- em->start, em->len);
- ret = -EUCLEAN;
- } else {
- read_extent_buffer(leaf, &bg,
- btrfs_item_ptr_offset(leaf, slot),
- sizeof(bg));
- flags = btrfs_stack_block_group_flags(&bg) &
- BTRFS_BLOCK_GROUP_TYPE_MASK;
-
- if (flags != (em->map_lookup->type &
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
- btrfs_err(fs_info,
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
- found_key.objectid,
- found_key.offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
- em->map_lookup->type));
- ret = -EUCLEAN;
- } else {
- ret = 0;
- }
- }
- free_extent_map(em);
- goto out;
+ return read_bg_from_eb(fs_info, &found_key, path);
}
- path->slots[0]++;
}
-out:
return ret;
}
@@ -1578,8 +1741,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
/**
- * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * Map a physical disk address to a list of logical addresses
+ *
+ * @fs_info: the filesystem
* @chunk_start: logical address of block group
+ * @bdev: physical device to resolve, can be NULL to indicate any device
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
@@ -1589,9 +1755,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*/
-EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -1607,19 +1773,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
return -EIO;
map = em->map_lookup;
- data_stripe_length = em->len;
+ data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
+ chunk_start = em->start;
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- data_stripe_length = div_u64(data_stripe_length,
- map->num_stripes / map->sub_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- data_stripe_length = div_u64(data_stripe_length,
- nr_data_stripes(map));
+ /* For RAID5/6 adjust to a full IO stripe length */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
io_stripe_size = map->stripe_len * nr_data_stripes(map);
- }
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
@@ -1630,20 +1790,23 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
u64 stripe_nr;
+ u64 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
+ if (bdev && map->stripes[i].dev->bdev != bdev)
+ continue;
+
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
}
/*
* The remaining case would be for RAID56, multiply by
@@ -1651,7 +1814,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
* instead of map->stripe_len
*/
- bytenr = chunk_start + stripe_nr * io_stripe_size;
+ bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) {
@@ -1676,6 +1839,7 @@ out:
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ const bool zoned = btrfs_is_zoned(fs_info);
u64 bytenr;
u64 *logical;
int stripe_len;
@@ -1692,31 +1856,26 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->start,
+ ret = btrfs_rmap_block(fs_info, cache->start, NULL,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
- while (nr--) {
- u64 start, len;
-
- if (logical[nr] > cache->start + cache->length)
- continue;
-
- if (logical[nr] + stripe_len <= cache->start)
- continue;
+ /* Shouldn't have super stripes in sequential zones */
+ if (zoned && nr) {
+ btrfs_err(fs_info,
+ "zoned: block group %llu must not contain super block",
+ cache->start);
+ return -EUCLEAN;
+ }
- start = logical[nr];
- if (start < cache->start) {
- start = cache->start;
- len = (logical[nr] + stripe_len) - start;
- } else {
- len = min_t(u64, stripe_len,
- cache->start + cache->length - start);
- }
+ while (nr--) {
+ u64 len = min_t(u64, stripe_len,
+ cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = btrfs_add_excluded_extent(fs_info, start, len);
+ ret = btrfs_add_excluded_extent(fs_info, logical[nr],
+ len);
if (ret) {
kfree(logical);
return ret;
@@ -1728,24 +1887,8 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
-
- down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
-}
-
static struct btrfs_block_group *btrfs_create_block_group_cache(
- struct btrfs_fs_info *fs_info, u64 start, u64 size)
+ struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
@@ -1761,15 +1904,13 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
}
cache->start = start;
- cache->length = size;
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
- atomic_set(&cache->count, 1);
+ refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
@@ -1779,10 +1920,12 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
- btrfs_init_free_space_ctl(cache);
- atomic_set(&cache->trimming, 0);
+ INIT_LIST_HEAD(&cache->active_bg_list);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
+ atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+ cache->full_stripe_locks_root.root = RB_ROOT;
+ mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
@@ -1842,24 +1985,27 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
}
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_path *path,
+ struct btrfs_block_group_item *bgi,
const struct btrfs_key *key,
int need_clear)
{
- struct extent_buffer *leaf = path->nodes[0];
struct btrfs_block_group *cache;
- struct btrfs_space_info *space_info;
- struct btrfs_block_group_item bgi;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
- int slot = path->slots[0];
int ret;
ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
- cache = btrfs_create_block_group_cache(info, key->objectid, key->offset);
+ cache = btrfs_create_block_group_cache(info, key->objectid);
if (!cache)
return -ENOMEM;
+ cache->length = key->offset;
+ cache->used = btrfs_stack_block_group_used(bgi);
+ cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+
+ set_free_space_tree_thresholds(cache);
+
if (need_clear) {
/*
* When we mount with old space cache, we need to
@@ -1874,10 +2020,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (btrfs_test_opt(info, SPACE_CACHE))
cache->disk_cache_state = BTRFS_DC_CLEAR;
}
- read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
- cache->used = btrfs_stack_block_group_used(&bgi);
- cache->flags = btrfs_stack_block_group_flags(&bgi);
if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
btrfs_err(info,
@@ -1887,6 +2029,13 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
+ ret = btrfs_load_block_group_zone_info(cache, false);
+ if (ret) {
+ btrfs_err(info, "zoned: failed to load zone info of bg %llu",
+ cache->start);
+ goto error;
+ }
+
/*
* We need to exclude the super stripes now so that the space info has
* super bytes accounted for, otherwise we'll think we have more space
@@ -1900,20 +2049,28 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
/*
- * Check for two cases, either we are full, and therefore don't need
- * to bother with the caching work since we won't find any space, or we
- * are empty, and we can just add all the space in and be done with it.
- * This saves us _a_lot_ of time, particularly in the full case.
+ * For zoned filesystem, space after the allocation offset is the only
+ * free space for a block group. So, we don't need any caching work.
+ * btrfs_calc_zone_unusable() will set the amount of free space and
+ * zone_unusable space.
+ *
+ * For regular filesystem, check for two cases, either we are full, and
+ * therefore don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all the space
+ * in and be done with it. This saves us _a_lot_ of time, particularly
+ * in the full case.
*/
- if (key->offset == cache->used) {
- cache->last_byte_to_unpin = (u64)-1;
+ if (btrfs_is_zoned(info)) {
+ btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->length == cache->used) {
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, key->objectid,
- key->objectid + key->offset);
+ add_new_free_space(cache, cache->start,
+ cache->start + cache->length);
btrfs_free_excluded_extents(cache);
}
@@ -1923,31 +2080,81 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, key->offset,
- cache->used, cache->bytes_super, &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
+ btrfs_add_bg_to_space_info(info, cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
return ret;
}
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct rb_node *node;
+ int ret = 0;
+
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_block_group *bg;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ map = em->map_lookup;
+ bg = btrfs_create_block_group_cache(fs_info, em->start);
+ if (!bg) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* Fill dummy cache as FULL */
+ bg->length = em->len;
+ bg->flags = map->type;
+ bg->cached = BTRFS_CACHE_FINISHED;
+ bg->used = em->len;
+ bg->flags = map->type;
+ ret = btrfs_add_block_group_cache(fs_info, bg);
+ /*
+ * We may have some valid block group cache added already, in
+ * that case we skip to the next one.
+ */
+ if (ret == -EEXIST) {
+ ret = 0;
+ btrfs_put_block_group(bg);
+ continue;
+ }
+
+ if (ret) {
+ btrfs_remove_free_space_cache(bg);
+ btrfs_put_block_group(bg);
+ break;
+ }
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+
+ set_avail_alloc_bits(fs_info, bg->flags);
+ }
+ if (!ret)
+ btrfs_init_global_block_rsv(fs_info);
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
+ struct btrfs_root *root = btrfs_block_group_root(info);
struct btrfs_path *path;
int ret;
struct btrfs_block_group *cache;
@@ -1956,13 +2163,24 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
+ return fill_dummy_bgs(info);
+
key.objectid = 0;
key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(info->super_copy);
if (btrfs_test_opt(info, SPACE_CACHE) &&
@@ -1972,22 +2190,44 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
+ struct btrfs_block_group_item bgi;
+ struct extent_buffer *leaf;
+ int slot;
+
ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
goto error;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- ret = read_one_block_group(info, path, &key, need_clear);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_release_path(path);
+ ret = read_one_block_group(info, &bgi, &key, need_clear);
if (ret < 0)
goto error;
key.objectid += key.offset;
key.offset = 0;
- btrfs_release_path(path);
}
+ btrfs_release_path(path);
+
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2012,46 +2252,184 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
+ /*
+ * We've hit some error while reading the extent tree, and have
+ * rescue=ibadroots mount option.
+ * Try to fill the tree using dummy block groups so that the user can
+ * continue to mount and grab their data.
+ */
+ if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+ ret = fill_dummy_bgs(info);
return ret;
}
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+/*
+ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group *block_group;
- struct btrfs_root *extent_root = fs_info->extent_root;
- struct btrfs_block_group_item item;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
+ struct btrfs_key key;
+
+ spin_lock(&block_group->lock);
+ btrfs_set_stack_block_group_used(&bgi, block_group->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ block_group->global_root_id);
+ btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
+ spin_unlock(&block_group->lock);
+
+ return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+}
+
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
struct btrfs_key key;
+ int ret;
+
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i;
int ret = 0;
- if (!trans->can_flush_pending_bgs)
- return;
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ stripe_size = em->orig_block_len;
+
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group;
+ int ret = 0;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
- spin_lock(&block_group->lock);
- btrfs_set_stack_block_group_used(&item, block_group->used);
- btrfs_set_stack_block_group_chunk_objectid(&item,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_stack_block_group_flags(&item, block_group->flags);
- key.objectid = block_group->start;
- key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- key.offset = block_group->length;
- spin_unlock(&block_group->lock);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
- ret = btrfs_insert_item(trans, extent_root, &key, &item,
- sizeof(item));
+ ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+ ret = insert_dev_extents(trans, block_group->start,
+ block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2060,8 +2438,30 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size)
+/*
+ * For extent tree v2 we use the block_group_item->chunk_offset to point at our
+ * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ */
+static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* If we have a smaller fs index based on 128MiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2069,35 +2469,38 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
btrfs_set_log_full_commit(trans);
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
+ cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
if (!cache)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+ cache->length = size;
+ set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- cache->needs_free_space = 1;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ cache->needs_free_space = 1;
+
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ERR_PTR(ret);
+ }
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
btrfs_free_excluded_extents(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
add_new_free_space(cache, chunk_offset, chunk_offset + size);
btrfs_free_excluded_extents(cache);
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
- }
-#endif
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
@@ -2110,7 +2513,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
/*
@@ -2118,66 +2521,24 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, &cache->space_info);
+ btrfs_add_bg_to_space_info(fs_info, cache);
btrfs_update_global_block_rsv(fs_info);
- link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ cache->space_info->bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
trans->delayed_ref_updates++;
btrfs_update_delayed_refs_rsv(trans);
set_avail_alloc_bits(fs_info, type);
- return 0;
-}
-
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
-{
- u64 num_devices;
- u64 stripped;
-
- /*
- * if restripe for this chunk_type is on pick target profile and
- * return, otherwise do the usual balance
- */
- stripped = get_restripe_target(fs_info, flags);
- if (stripped)
- return extended_to_chunk(stripped);
-
- num_devices = fs_info->fs_devices->rw_devices;
-
- stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
- BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
-
- if (num_devices == 1) {
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* turn raid0 into single device chunks */
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return stripped;
-
- /* turn mirroring into duplication */
- if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
- BTRFS_BLOCK_GROUP_RAID10))
- return stripped | BTRFS_BLOCK_GROUP_DUP;
- } else {
- /* they already had raid on here, just return */
- if (flags & stripped)
- return flags;
-
- stripped |= BTRFS_BLOCK_GROUP_DUP;
- stripped = flags & ~stripped;
-
- /* switch duplicated blocks with raid1 */
- if (flags & BTRFS_BLOCK_GROUP_DUP)
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
-
- /* this is drive concat, leave it alone */
- }
-
- return flags;
+ return cache;
}
/*
@@ -2194,38 +2555,56 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
int ret;
-
-again:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ bool dirty_bg_running;
/*
- * we're not allowed to set block groups readonly after the dirty
- * block groups cache has started writing. If it already started,
- * back off and let this transaction commit
+ * This can only happen when we are doing read-only scrub on read-only
+ * mount.
+ * In that case we should not start a new transaction on read-only fs.
+ * Thus here we skip all chunk allocations.
*/
- mutex_lock(&fs_info->ro_block_group_mutex);
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
- u64 transid = trans->transid;
-
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ ret = inc_block_group_ro(cache, 0);
mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans);
-
- ret = btrfs_wait_for_commit(fs_info, transid);
- if (ret)
- return ret;
- goto again;
+ return ret;
}
+ do {
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ dirty_bg_running = false;
+
+ /*
+ * We're not allowed to set block groups readonly after the dirty
+ * block group cache has started writing. If it already started,
+ * back off and let this transaction commit.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ dirty_bg_running = true;
+ }
+ } while (dirty_bg_running);
+
if (do_chunk_alloc) {
/*
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
ret = btrfs_chunk_alloc(trans, alloc_flags,
CHUNK_ALLOC_FORCE);
@@ -2241,7 +2620,7 @@ again:
}
ret = inc_block_group_ro(cache, 0);
- if (!do_chunk_alloc)
+ if (!do_chunk_alloc || ret == -ETXTBSY)
goto unlock_out;
if (!ret)
goto out;
@@ -2249,10 +2628,20 @@ again:
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+ * grant metadata tickets for zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ if (ret < 0)
+ goto out;
+
ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2274,8 +2663,17 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes back */
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ sinfo->bytes_zone_unusable += cache->zone_unusable;
+ sinfo->bytes_readonly -= cache->zone_unusable;
+ }
num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super - cache->used;
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
@@ -2283,13 +2681,13 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_unlock(&sinfo->lock);
}
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_block_group *cache)
+static int update_block_group_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
@@ -2299,7 +2697,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = cache->length;
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -2310,7 +2708,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
@@ -2330,10 +2728,13 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
- u64 num_pages = 0;
+ u64 cache_size = 0;
int retries = 0;
int ret = 0;
+ if (!btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
/*
* If this block group is smaller than 100 megs don't bother caching the
* block group.
@@ -2345,7 +2746,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
return 0;
}
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
again:
inode = lookup_free_space_inode(block_group, path);
@@ -2374,7 +2775,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -2439,19 +2840,20 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->length, SZ_256M);
- if (!num_pages)
- num_pages = 1;
+ cache_size = div_u64(block_group->length, SZ_256M);
+ if (!cache_size)
+ cache_size = 1;
- num_pages *= 16;
- num_pages *= PAGE_SIZE;
+ cache_size *= 16;
+ cache_size *= fs_info->sectorsize;
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
+ cache_size, false);
if (ret)
goto out_put;
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
- num_pages, num_pages,
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
+ cache_size, cache_size,
&alloc_hint);
/*
* Our cache requires contiguous chunks so that we don't modify a bunch
@@ -2529,7 +2931,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2546,8 +2947,10 @@ again:
if (!path) {
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
/*
@@ -2593,7 +2996,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2611,7 +3013,7 @@ again:
}
}
if (!ret) {
- ret = write_one_cache_group(trans, path, cache);
+ ret = update_block_group_item(trans, path, cache);
/*
* Our block group might still be attached to the list
* of new block groups in the transaction handle of some
@@ -2641,16 +3043,14 @@ again:
btrfs_put_block_group(cache);
if (drop_reserve)
btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
@@ -2659,7 +3059,8 @@ again:
* Go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2673,7 +3074,12 @@ again:
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
@@ -2690,7 +3096,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2748,7 +3153,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -2760,7 +3164,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
}
}
if (!ret) {
- ret = write_one_cache_group(trans, path, cache);
+ ret = update_block_group_item(trans, path, cache);
/*
* One of the free space endio workers might have
* created a new block group while updating a free space
@@ -2771,13 +3175,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- ret = write_one_cache_group(trans, path, cache);
+ ret = update_block_group_item(trans, path, cache);
}
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2807,8 +3211,33 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
@@ -2829,6 +3258,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ bool reclaim;
+
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@@ -2843,7 +3274,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, true);
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
@@ -2874,14 +3305,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(info->pinned_extents,
+ set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
}
@@ -2904,6 +3333,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
@@ -2947,6 +3378,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2980,6 +3418,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2988,12 +3428,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3029,11 +3467,203 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+{
+ struct btrfs_block_group *bg;
+ int ret;
+
+ /*
+ * Check if we have enough space in the system space info because we
+ * will need to update device items in the chunk btree and insert a new
+ * chunk item in the chunk btree as well. This will allocate a new
+ * system block group if needed.
+ */
+ check_system_chunk(trans, flags);
+
+ bg = btrfs_create_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ /*
+ * Normally we are not expected to fail with -ENOSPC here, since we have
+ * previously reserved space in the system space_info and allocated one
+ * new system chunk if necessary. However there are three exceptions:
+ *
+ * 1) We may have enough free space in the system space_info but all the
+ * existing system block groups have a profile which can not be used
+ * for extent allocation.
+ *
+ * This happens when mounting in degraded mode. For example we have a
+ * RAID1 filesystem with 2 devices, lose one device and mount the fs
+ * using the other device in degraded mode. If we then allocate a chunk,
+ * we may have enough free space in the existing system space_info, but
+ * none of the block groups can be used for extent allocation since they
+ * have a RAID1 profile, and because we are in degraded mode with a
+ * single device, we are forced to allocate a new system chunk with a
+ * SINGLE profile. Making check_system_chunk() iterate over all system
+ * block groups and check if they have a usable profile and enough space
+ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
+ * try again after forcing allocation of a new system chunk. Like this
+ * we avoid paying the cost of that search in normal circumstances, when
+ * we were not mounted in degraded mode;
+ *
+ * 2) We had enough free space info the system space_info, and one suitable
+ * block group to allocate from when we called check_system_chunk()
+ * above. However right after we called it, the only system block group
+ * with enough free space got turned into RO mode by a running scrub,
+ * and in this case we have to allocate a new one and retry. We only
+ * need do this allocate and retry once, since we have a transaction
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ btrfs_trans_release_chunk_metadata(trans);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
+}
+
/*
- * If force is CHUNK_ALLOC_FORCE:
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ * the chunk, the chunk mapping, create its block group and add the items
+ * that belong in the chunk btree to it - more specifically, we need to
+ * update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ * group item to the extent btree and the device extent items to the devices
+ * btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempted to insert the new block group item
+ * in the extent btree right way, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space), however while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ * metadata space_info and therefore don't trigger allocation of a new chunk.
+ * However later when the task actually tries to COW an extent buffer from
+ * the extent btree or from the device btree for example, it is forced to
+ * allocate a new block group (chunk) because the only one that had enough
+ * free space was just turned to RO mode by a running scrub for example (or
+ * device replace, block group reclaim thread, etc), so we can not use it
+ * for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ * we end up not allocating a new metadata chunk in that case. However if
+ * the filesystem was mounted in degraded mode, none of the existing block
+ * groups might be suitable for extent allocation due to their incompatible
+ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
+ * use a RAID1 profile, in degraded mode using a single device). In this case
+ * when the task attempts to COW some extent buffer of the extent btree for
+ * example, it will trigger allocation of a new metadata block group with a
+ * suitable profile (SINGLE profile in the example of the degraded mount of
+ * the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ * it attempts to COW an extent buffer from the extent or device btree for
+ * example, it does not find any free extent in any metadata block group,
+ * therefore forced to try to allocate a new metadata block group.
+ * This is because some other task allocated all available extents in the
+ * meanwhile - this typically happens with tasks that don't reserve space
+ * properly, either intentionally or as a bug. One example where this is
+ * done intentionally is fsync, as it does not reserve any transaction units
+ * and ends up allocating a variable number of metadata extents for log
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
+ *
+ * We also need this 2 phases setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the later is used before doing
+ * a modification to the chunk btree - use cases for the later are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunks btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock twice an
+ * extent buffer of the chunk btree, first time before triggering the chunk
+ * allocation and the second time during chunk allocation while attempting to
+ * update the chunks btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
+ * If @force is NOT CHUNK_ALLOC_FORCE:
* - return 0 if it doesn't need to allocate a new chunk,
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
@@ -3043,13 +3673,43 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
+ bool from_extent_allocation = false;
int ret = 0;
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
+ /*
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
ASSERT(space_info);
@@ -3078,6 +3738,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
@@ -3113,15 +3774,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- /*
- * Check if we have enough space in SYSTEM chunk because we may need
- * to update devices.
- */
- check_system_chunk(trans, flags);
-
- ret = btrfs_alloc_chunk(trans, flags);
+ ret_bg = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
@@ -3138,22 +3806,6 @@ out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
- /*
- * When we allocate a new chunk we reserve space in the chunk block
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
- * add new nodes/leafs to it if we end up needing to do it when
- * inserting the chunk item and updating device items as part of the
- * second phase of chunk allocation, performed by
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
- * large number of new block groups to create in our transaction
- * handle's new_bgs list to avoid exhausting the chunk block reserve
- * in extreme cases - like having a single transaction create many new
- * block groups when starting to write out the free space caches of all
- * the block groups that were made dirty during the lifetime of the
- * transaction.
- */
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
- btrfs_create_pending_block_groups(trans);
return ret;
}
@@ -3169,17 +3821,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
@@ -3192,20 +3841,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *bg;
/*
* Ignore failure to create system chunk. We might end up not
@@ -3213,50 +3857,111 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
- ret = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ } else {
+ /*
+ * We have a new chunk. We also need to activate it for
+ * zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ if (ret < 0)
+ return;
+
+ /*
+ * If we fail to add the chunk item here, we end up
+ * trying again at phase 2 of chunk allocation, at
+ * btrfs_create_pending_block_groups(). So ignore
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
+ */
+ btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ }
}
if (!ret) {
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
+ ret = btrfs_block_rsv_add(fs_info,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
- u64 last = 0;
- while (1) {
- struct inode *inode;
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct inode *inode = block_group->inode;
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- btrfs_wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
+ block_group->inode = NULL;
spin_unlock(&block_group->lock);
- block_group = btrfs_next_block_group(block_group);
- }
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->start + block_group->length;
- btrfs_put_block_group(block_group);
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ block_group = btrfs_next_block_group(block_group);
}
}
@@ -3272,14 +3977,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->commit_root_sem);
+ write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- up_write(&info->commit_root_sem);
+ write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@@ -3289,16 +3994,34 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
+
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
spin_unlock(&info->unused_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@@ -3317,20 +4040,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
- ASSERT(atomic_read(&block_group->count) == 1);
+ ASSERT(refcount_read(&block_group->refs) == 1);
+ ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
}
- spin_unlock(&info->block_group_cache_lock);
-
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
+ write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
@@ -3344,11 +4060,87 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
* important and indicates a real bug if this happens.
*/
if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on bytes_reserved > 0 in
+ * that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+ }
+
+ WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
return 0;
}
+
+void btrfs_freeze_block_group(struct btrfs_block_group *cache)
+{
+ atomic_inc(&cache->frozen);
+}
+
+void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
+
+ spin_lock(&block_group->lock);
+ cleanup = (atomic_dec_and_test(&block_group->frozen) &&
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
+ spin_unlock(&block_group->lock);
+
+ if (cleanup) {
+ em_tree = &fs_info->mapping_tree;
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, block_group->start,
+ 1);
+ BUG_ON(!em); /* logic error, can't happen */
+ remove_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+
+ /* once for us and once for the tree */
+ free_extent_map(em);
+ free_extent_map(em);
+
+ /*
+ * We may have left one free space entry and other possible
+ * tasks trimming this block group have left 1 entry each one.
+ * Free them if any.
+ */
+ btrfs_remove_free_space_cache(block_group);
+ }
+}
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
+{
+ bool ret = true;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ bg->swap_extents++;
+ spin_unlock(&bg->lock);
+
+ return ret;
+}
+
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
+{
+ spin_lock(&bg->lock);
+ ASSERT(!bg->ro);
+ ASSERT(bg->swap_extents >= amount);
+ bg->swap_extents -= amount;
+ spin_unlock(&bg->lock);
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 107bb557ca8d..8fb14b99a1d1 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -35,11 +35,33 @@ enum btrfs_discard_state {
* the FS with empty chunks
*
* CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
+ * find_free_extent() that also activaes the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT,
+};
+
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+ BLOCK_GROUP_FLAG_IREF,
+ BLOCK_GROUP_FLAG_REMOVED,
+ BLOCK_GROUP_FLAG_TO_COPY,
+ BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+ BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
};
struct btrfs_caching_control {
@@ -48,13 +70,20 @@ struct btrfs_caching_control {
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
- u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -68,6 +97,7 @@ struct btrfs_block_group {
u64 bytes_super;
u64 flags;
u64 cache_generation;
+ u64 global_root_id;
/*
* If the free space extent count exceeds this number, convert the block
@@ -90,18 +120,15 @@ struct btrfs_block_group {
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
+ unsigned long runtime_flags;
unsigned int ro;
- unsigned int iref:1;
- unsigned int has_caching_ctl:1;
- unsigned int removed:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
- u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -114,8 +141,7 @@ struct btrfs_block_group {
/* For block groups in the same raid type */
struct list_head list;
- /* Usage count */
- atomic_t count;
+ refcount_t refs;
/*
* List of struct btrfs_free_clusters for this block group.
@@ -129,8 +155,17 @@ struct btrfs_block_group {
/* For read-only block groups */
struct list_head ro_list;
+ /*
+ * When non-zero it means the block group's logical address and its
+ * device extents can not be reused for future block group allocations
+ * until the counter goes down to 0. This is to prevent them from being
+ * reused while some task is still using the block group after it was
+ * deleted - we want to make sure they can only be reused for new block
+ * groups after that task is done with the deleted block group.
+ */
+ atomic_t frozen;
+
/* For discard operations */
- atomic_t trimming;
struct list_head discard_list;
int discard_index;
u64 discard_eligible_time;
@@ -173,8 +208,30 @@ struct btrfs_block_group {
*/
int needs_free_space;
+ /* Flag indicating this block group is placed on a sequential zone */
+ bool seq_zone;
+
+ /*
+ * Number of extents in this block group used for swap files.
+ * All accesses protected by the spinlock 'lock'.
+ */
+ int swap_extents;
+
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+
+ /*
+ * Allocation offset for the block group to implement sequential
+ * allocation. This is used only on a zoned filesystem.
+ */
+ u64 alloc_offset;
+ u64 zone_unusable;
+ u64 zone_capacity;
+ u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -217,14 +274,13 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
-int btrfs_cache_block_group(struct btrfs_block_group *cache,
- int load_cache_only);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
@@ -237,9 +293,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
@@ -248,7 +308,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc);
+ u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -257,9 +317,14 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len);
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
@@ -283,9 +348,10 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
cache->cached == BTRFS_CACHE_ERROR;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len);
-#endif
+void btrfs_freeze_block_group(struct btrfs_block_group *cache);
+void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index d07bd41a7c1e..ec96285357e0 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -5,6 +5,100 @@
#include "block-rsv.h"
#include "space-info.h"
#include "transaction.h"
+#include "block-group.h"
+#include "disk-io.h"
+
+/*
+ * HOW DO BLOCK RESERVES WORK
+ *
+ * Think of block_rsv's as buckets for logically grouped metadata
+ * reservations. Each block_rsv has a ->size and a ->reserved. ->size is
+ * how large we want our block rsv to be, ->reserved is how much space is
+ * currently reserved for this block reserve.
+ *
+ * ->failfast exists for the truncate case, and is described below.
+ *
+ * NORMAL OPERATION
+ *
+ * -> Reserve
+ * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
+ *
+ * We call into btrfs_reserve_metadata_bytes() with our bytes, which is
+ * accounted for in space_info->bytes_may_use, and then add the bytes to
+ * ->reserved, and ->size in the case of btrfs_block_rsv_add.
+ *
+ * ->size is an over-estimation of how much we may use for a particular
+ * operation.
+ *
+ * -> Use
+ * Entrance: btrfs_use_block_rsv
+ *
+ * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
+ * to determine the appropriate block_rsv to use, and then verify that
+ * ->reserved has enough space for our tree block allocation. Once
+ * successful we subtract fs_info->nodesize from ->reserved.
+ *
+ * -> Finish
+ * Entrance: btrfs_block_rsv_release
+ *
+ * We are finished with our operation, subtract our individual reservation
+ * from ->size, and then subtract ->size from ->reserved and free up the
+ * excess if there is any.
+ *
+ * There is some logic here to refill the delayed refs rsv or the global rsv
+ * as needed, otherwise the excess is subtracted from
+ * space_info->bytes_may_use.
+ *
+ * TYPES OF BLOCK RESERVES
+ *
+ * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
+ * These behave normally, as described above, just within the confines of the
+ * lifetime of their particular operation (transaction for the whole trans
+ * handle lifetime, for example).
+ *
+ * BLOCK_RSV_GLOBAL
+ * It is impossible to properly account for all the space that may be required
+ * to make our extent tree updates. This block reserve acts as an overflow
+ * buffer in case our delayed refs reserve does not reserve enough space to
+ * update the extent tree.
+ *
+ * We can steal from this in some cases as well, notably on evict() or
+ * truncate() in order to help users recover from ENOSPC conditions.
+ *
+ * BLOCK_RSV_DELALLOC
+ * The individual item sizes are determined by the per-inode size
+ * calculations, which are described with the delalloc code. This is pretty
+ * straightforward, it's just the calculation of ->size encodes a lot of
+ * different items, and thus it gets used when updating inodes, inserting file
+ * extents, and inserting checksums.
+ *
+ * BLOCK_RSV_DELREFS
+ * We keep a running tally of how many delayed refs we have on the system.
+ * We assume each one of these delayed refs are going to use a full
+ * reservation. We use the transaction items and pre-reserve space for every
+ * operation, and use this reservation to refill any gap between ->size and
+ * ->reserved that may exist.
+ *
+ * From there it's straightforward, removing a delayed ref means we remove its
+ * count from ->size and free up reservations as necessary. Since this is
+ * the most dynamic block reserve in the system, we will try to refill this
+ * block reserve first with any excess returned by any other block reserve.
+ *
+ * BLOCK_RSV_EMPTY
+ * This is the fallback block reserve to make us try to reserve space if we
+ * don't have a specific bucket for this allocation. It is mostly used for
+ * updating the device tree and such, since that is a separate pool we're
+ * content to just reserve space from the space_info on demand.
+ *
+ * BLOCK_RSV_TEMP
+ * This is used by things like truncate and iput. We will temporarily
+ * allocate a block reserve, set it to some size, and then truncate bytes
+ * until we have no space left. With ->failfast set we'll simply return
+ * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
+ * to make a new reservation. This is because these operations are
+ * unbounded, so we want to do as much work as we can, and then back off and
+ * re-reserve.
+ */
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
@@ -24,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
+ block_rsv->full = true;
} else {
num_bytes = 0;
}
@@ -48,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
bytes_to_add = min(num_bytes, bytes_to_add);
dest->reserved += bytes_to_add;
if (dest->reserved >= dest->size)
- dest->full = 1;
+ dest->full = true;
num_bytes -= bytes_to_add;
}
spin_unlock(&dest->lock);
@@ -77,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
@@ -86,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
btrfs_init_block_rsv(rsv, type);
rsv->space_info = btrfs_find_space_info(fs_info,
@@ -94,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
struct btrfs_block_rsv *block_rsv;
@@ -111,11 +205,11 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
{
if (!rsv)
return;
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
kfree(rsv);
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
{
@@ -124,7 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -148,7 +242,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush)
{
@@ -169,7 +263,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -178,9 +272,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, u64 *qgroup_to_release)
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ u64 *qgroup_to_release)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
@@ -192,7 +286,7 @@ u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
- else if (block_rsv != global_rsv && !delayed_rsv->full)
+ else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
@@ -210,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
if (block_rsv->reserved >= num_bytes) {
block_rsv->reserved -= num_bytes;
if (block_rsv->reserved < block_rsv->size)
- block_rsv->full = 0;
+ block_rsv->full = false;
ret = 0;
}
spin_unlock(&block_rsv->lock);
@@ -225,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
if (update_size)
block_rsv->size += num_bytes;
else if (block_rsv->reserved >= block_rsv->size)
- block_rsv->full = 1;
+ block_rsv->full = true;
spin_unlock(&block_rsv->lock);
}
@@ -247,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
}
global_rsv->reserved -= num_bytes;
if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
spin_unlock(&global_rsv->lock);
btrfs_block_rsv_add_bytes(dest, num_bytes, true);
@@ -258,23 +352,29 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
- u64 num_bytes;
- unsigned min_items;
+ struct btrfs_root *root, *tmp;
+ u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
+ unsigned int min_items = 1;
/*
* The global block rsv is based on the size of the extent tree, the
* checksum tree and the root tree. If the fs is empty we want to set
* it to a minimal amount for safety.
+ *
+ * We also are going to need to modify the minimum of the tree root and
+ * any global roots we could touch.
*/
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
- btrfs_root_used(&fs_info->csum_root->root_item) +
- btrfs_root_used(&fs_info->tree_root->root_item);
-
- /*
- * We at a minimum are going to modify the csum root, the tree root, and
- * the extent root.
- */
- min_items = 3;
+ read_lock(&fs_info->global_root_lock);
+ rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
+ rb_node) {
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ num_bytes += btrfs_root_used(&root->root_item);
+ min_items++;
+ }
+ }
+ read_unlock(&fs_info->global_root_lock);
/*
* But we also want to reserve enough space so we can do the fallback
@@ -297,9 +397,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
- block_rsv->reserved += num_bytes;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
+ block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
@@ -308,15 +408,39 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
btrfs_try_granting_tickets(fs_info, sinfo);
}
- if (block_rsv->reserved == block_rsv->size)
- block_rsv->full = 1;
- else
- block_rsv->full = 0;
+ block_rsv->full = (block_rsv->reserved == block_rsv->size);
+ if (block_rsv->size >= sinfo->total_bytes)
+ sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&block_rsv->lock);
spin_unlock(&sinfo->lock);
}
+void btrfs_init_root_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ switch (root->root_key.objectid) {
+ case BTRFS_CSUM_TREE_OBJECTID:
+ case BTRFS_EXTENT_TREE_OBJECTID:
+ case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ root->block_rsv = &fs_info->delayed_refs_rsv;
+ break;
+ case BTRFS_ROOT_TREE_OBJECTID:
+ case BTRFS_DEV_TREE_OBJECTID:
+ case BTRFS_QUOTA_TREE_OBJECTID:
+ root->block_rsv = &fs_info->global_block_rsv;
+ break;
+ case BTRFS_CHUNK_TREE_OBJECTID:
+ root->block_rsv = &fs_info->chunk_block_rsv;
+ break;
+ default:
+ root->block_rsv = NULL;
+ break;
+ }
+}
+
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
@@ -331,20 +455,13 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
- fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
- if (fs_info->quota_root)
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
-
btrfs_update_global_block_rsv(fs_info);
}
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
+ NULL);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -362,9 +479,10 @@ static struct btrfs_block_rsv *get_block_rsv(
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- (root == fs_info->csum_root && trans->adding_csums) ||
- (root == fs_info->uuid_root))
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
+ (root == fs_info->uuid_root) ||
+ (trans->adding_csums &&
+ root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -415,10 +533,11 @@ again:
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
- "BTRFS: block rsv returned %d\n", ret);
+ "BTRFS: block rsv %d returned %d\n",
+ block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5..578c3497a455 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum;
/*
* Types of block reserves
*/
-enum {
+enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_GLOBAL,
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
@@ -25,9 +25,10 @@ struct btrfs_block_rsv {
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned short full;
- unsigned short type;
- unsigned short failfast;
+ bool full;
+ bool failfast;
+ /* Block reserve type, one of BTRFS_BLOCK_RSV_* */
+ enum btrfs_rsv_type type:8;
/*
* Qgroup equivalent for @size @reserved
@@ -49,19 +50,20 @@ struct btrfs_block_rsv {
u64 qgroup_rsv_reserved;
};
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type);
+void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
-int btrfs_block_rsv_refill(struct btrfs_root *root,
+int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -73,7 +75,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes, u64 *qgroup_to_release);
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
@@ -82,20 +84,21 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
-
-static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
-}
-
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u32 blocksize)
{
btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
- btrfs_block_rsv_release(fs_info, block_rsv, 0);
+ btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
+}
+
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+ return data_race(rsv->full);
}
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4e12a477d32e..54c2ccb36b61 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -7,12 +7,20 @@
#define BTRFS_INODE_H
#include <linux/hash.h>
+#include <linux/refcount.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
#include "delayed-inode.h"
/*
+ * Since we search a directory based on f_pos (struct dir_context::pos) we have
+ * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
+ * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
+ */
+#define BTRFS_DIR_START_INDEX 2
+
+/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -20,16 +28,45 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
+ /*
+ * Set and used when logging an inode and it serves to signal that an
+ * inode does not have xattrs, so subsequent fsyncs can avoid searching
+ * for xattrs to log. This bit must be cleared whenever a xattr is added
+ * to an inode.
+ */
+ BTRFS_INODE_NO_XATTRS,
+ /*
+ * Set when we are in a context where we need to start a transaction and
+ * have dirty pages with the respective file range locked. This is to
+ * ensure that when reserving space for the transaction, if we are low
+ * on available space and need to flush delalloc, we will not flush
+ * delalloc for this inode, because that could result in a deadlock (on
+ * the file range, inode's io_tree).
+ */
+ BTRFS_INODE_NO_DELALLOC_FLUSH,
+ /*
+ * Set when we are working on enabling verity for a file. Computing and
+ * writing the whole Merkle tree can take a while so we want to prevent
+ * races where two separate tasks attempt to simultaneously start verity
+ * on the same file.
+ */
+ BTRFS_INODE_VERITY_IN_PROGRESS,
+ /* Set when this inode is a free space inode. */
+ BTRFS_INODE_FREE_SPACE_INODE,
};
/* in memory btrfs inode */
@@ -45,7 +82,8 @@ struct btrfs_inode {
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
- * logged_trans).
+ * logged_trans), to access/update new_delalloc_bytes and to update the
+ * VFS' inode number of bytes used.
*/
spinlock_t lock;
@@ -58,7 +96,14 @@ struct btrfs_inode {
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
- struct extent_io_tree io_failure_tree;
+ struct rb_root io_failure_tree;
+ spinlock_t io_failure_lock;
+
+ /*
+ * Keep track of where the inode has extent items mapped in order to
+ * make sure the i_size adjustments are accurate
+ */
+ struct extent_io_tree file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -103,17 +148,26 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- /* total number of bytes pending delalloc, used by stat to calc the
- * real block usage of the file
+ /*
+ * Total number of bytes pending delalloc, used by stat to calculate the
+ * real block usage of the file. This is used only for files.
*/
u64 delalloc_bytes;
- /*
- * Total number of bytes pending delalloc that fall within a file
- * range that is either a hole or beyond EOF (and no prealloc extent
- * exists in the range). This is always <= delalloc_bytes.
- */
- u64 new_delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
/*
* total number of bytes pending defrag, used by stat to check whether
@@ -129,8 +183,9 @@ struct btrfs_inode {
u64 disk_i_size;
/*
- * if this is a directory then index_cnt is the counter for the index
- * number for new files that are created
+ * If this is a directory then index_cnt is the counter for the index
+ * number for new files that are created. For an empty directory, this
+ * must be initialized to BTRFS_DIR_START_INDEX.
*/
u64 index_cnt;
@@ -145,13 +200,26 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * The id/generation of the last transaction where this inode was
+ * either the source or the destination of a clone/dedupe operation.
+ * Used when logging an inode to know if there are shared extents that
+ * need special care when logging checksum items, to avoid duplicate
+ * checksum items in a log (which can lead to a corruption where we end
+ * up with missing checksum ranges after log replay).
+ * Protected by the vfs inode lock.
+ */
+ u64 last_reflink_trans;
+
+ /*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
u64 csum_bytes;
- /* flags field from the on disk inode */
+ /* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
+ /* Read-only compatibility flags, upper half of inode_item::flags */
+ u32 ro_flags;
/*
* Counters to keep track of the number of extent item's we may use due
@@ -181,16 +249,7 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
- /*
- * To avoid races between lockless (i_mutex not held) direct IO writes
- * and concurrent fsync requests. Direct IO writes must acquire read
- * access on this semaphore for creating an extent map and its
- * corresponding ordered extent. The fast fsync path must acquire write
- * access on this semaphore before it collects ordered extents and
- * extent maps.
- */
- struct rw_semaphore dio_sem;
-
+ struct rw_semaphore i_mmap_lock;
struct inode vfs_inode;
};
@@ -211,26 +270,31 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
-static inline void btrfs_insert_inode_hash(struct inode *inode)
-{
- unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
-
- __insert_inode_hash(inode, h);
-}
+#if BITS_PER_LONG == 32
+/*
+ * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
+ * we use the inode's location objectid which is a u64 to avoid truncation.
+ */
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
u64 ino = inode->location.objectid;
- /*
- * !ino: btree_inode
- * type == BTRFS_ROOT_ITEM_KEY: subvol dir
- */
- if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
+ if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
ino = inode->vfs_inode.i_ino;
return ino;
}
+#else
+
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ return inode->vfs_inode.i_ino;
+}
+
+#endif
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
@@ -239,14 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
-
- if (root == root->fs_info->tree_root &&
- btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
- return true;
- if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
- return true;
- return false;
+ return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
static inline bool is_data_inode(struct inode *inode)
@@ -265,73 +322,89 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
-static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
+/*
+ * Should be called while holding the inode's VFS lock in exclusive mode or in a
+ * context where no one else can access the inode concurrently (during inode
+ * creation or when loading an inode from disk).
+ */
+static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ /*
+ * The inode may have been part of a reflink operation in the last
+ * transaction that modified it, and then a fsync has reset the
+ * last_reflink_trans to avoid subsequent fsyncs in the same
+ * transaction to do unnecessary work. So update last_reflink_trans
+ * to the last_trans value (we have to be pessimistic and assume a
+ * reflink happened).
+ *
+ * The ->last_trans is protected by the inode's spinlock and we can
+ * have a concurrent ordered extent completion update it. Also set
+ * last_reflink_trans to ->last_trans only if the former is less than
+ * the later, because we can be called in a context where
+ * last_reflink_trans was set to the current transaction generation
+ * while ->last_trans was not yet updated in the current transaction,
+ * and therefore has a lower value.
+ */
+ spin_lock(&inode->lock);
+ if (inode->last_reflink_trans < inode->last_trans)
+ inode->last_reflink_trans = inode->last_trans;
+ spin_unlock(&inode->lock);
+}
+
+static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
- int ret = 0;
+ bool ret = false;
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit) {
- /*
- * After a ranged fsync we might have left some extent maps
- * (that fall outside the fsync's range). So return false
- * here if the list isn't empty, to make sure btrfs_log_inode()
- * will be called and process those extent maps.
- */
- smp_mb();
- if (list_empty(&inode->extent_tree.modified_extents))
- ret = 1;
- }
+ inode->last_sub_trans <= inode->root->last_log_commit)
+ ret = true;
spin_unlock(&inode->lock);
return ret;
}
-#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
-
-struct btrfs_dio_private {
- struct inode *inode;
- unsigned long flags;
- u64 logical_offset;
- u64 disk_bytenr;
- u64 bytes;
- void *private;
-
- /* number of bios pending for this dio */
- atomic_t pending_bios;
-
- /* IO errors */
- int errors;
-
- /* orig_bio is our btrfs_io_bio */
- struct bio *orig_bio;
-
- /* dio_bio came from fs/direct-io.c */
- struct bio *dio_bio;
-
- /*
- * The original bio may be split to several sub-bios, this is
- * done during endio of sub-bios
- */
- blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
- blk_status_t);
-};
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
*/
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
+ return (flags | ((u64)ro_flags << 32));
}
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+ u32 *flags, u32 *ro_flags)
{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
+ *flags = (u32)inode_item_flags;
+ *ro_flags = (u32)(inode_item_flags >> 32);
}
/* Array of bytes with variable length, hexadecimal format 0x1234 */
@@ -342,8 +415,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
- struct btrfs_super_block *sb = root->fs_info->super_copy;
- const u16 csum_size = btrfs_super_csum_size(sb);
+ const u32 csum_size = root->fs_info->csum_size;
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index a0ce69f2d27c..98c6e5feab19 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -77,9 +77,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/mutex.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
@@ -152,12 +150,9 @@ struct btrfsic_block {
struct list_head ref_to_list; /* list */
struct list_head ref_from_list; /* list */
struct btrfsic_block *next_in_same_bio;
- void *orig_bio_bh_private;
- union {
- bio_end_io_t *bio;
- bh_end_io_t *bh;
- } orig_bio_bh_end_io;
- int submit_bio_bh_rw;
+ void *orig_bio_private;
+ bio_end_io_t *orig_bio_end_io;
+ blk_opf_t submit_bio_bh_rw;
u64 flush_gen; /* only valid if !never_written */
};
@@ -190,7 +185,6 @@ struct btrfsic_dev_state {
struct list_head collision_resolving_node; /* list node */
struct btrfsic_block dummy_block_for_bio_bh_flush;
u64 last_flush_gen;
- char name[BDEVNAME_SIZE];
};
struct btrfsic_block_hashtable {
@@ -237,7 +231,6 @@ struct btrfsic_stack_frame {
struct btrfsic_state {
u32 print_mask;
int include_extent_data;
- int csum_size;
struct list_head all_blocks_list;
struct btrfsic_block_hashtable block_hashtable;
struct btrfsic_block_link_hashtable block_link_hashtable;
@@ -248,47 +241,6 @@ struct btrfsic_state {
u32 datablock_size;
};
-static void btrfsic_block_init(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_alloc(void);
-static void btrfsic_block_free(struct btrfsic_block *b);
-static void btrfsic_block_link_init(struct btrfsic_block_link *n);
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
-static void btrfsic_block_link_free(struct btrfsic_block_link *n);
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h);
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h);
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices);
static int btrfsic_process_metablock(struct btrfsic_state *state,
struct btrfsic_block *block,
struct btrfsic_block_data_ctx *block_ctx,
@@ -318,21 +270,11 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
-static void btrfsic_dump_database(struct btrfsic_state *state);
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- char **datav, unsigned int num_pages);
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- struct buffer_head *bh,
- int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
static void btrfsic_bio_end_io(struct bio *bp);
-static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int recursion_level);
@@ -399,8 +341,8 @@ static void btrfsic_block_init(struct btrfsic_block *b)
b->never_written = 0;
b->mirror_num = 0;
b->next_in_same_bio = NULL;
- b->orig_bio_bh_private = NULL;
- b->orig_bio_bh_end_io.bio = NULL;
+ b->orig_bio_private = NULL;
+ b->orig_bio_end_io = NULL;
INIT_LIST_HEAD(&b->collision_resolving_node);
INIT_LIST_HEAD(&b->all_blocks_node);
INIT_LIST_HEAD(&b->ref_to_list);
@@ -459,7 +401,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
ds->bdev = NULL;
ds->state = NULL;
- ds->name[0] = '\0';
INIT_LIST_HEAD(&ds->collision_resolving_node);
ds->last_flush_gen = 0;
btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
@@ -637,10 +578,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int pass;
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (NULL == selected_super) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!selected_super)
return -ENOMEM;
- }
list_for_each_entry(device, dev_head, dev_list) {
int i;
@@ -668,8 +607,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
return -1;
}
- state->csum_size = btrfs_super_csum_size(selected_super);
-
for (pass = 0; pass < 3; pass++) {
int num_copies;
int mirror_num;
@@ -767,29 +704,31 @@ static int btrfsic_process_superblock_dev_mirror(
struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *super_tmp;
u64 dev_bytenr;
- struct buffer_head *bh;
struct btrfsic_block *superblock_tmp;
int pass;
struct block_device *const superblock_bdev = device->bdev;
+ struct page *page;
+ struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
+ int ret = 0;
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
- bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (NULL == bh)
+
+ page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
return -1;
- super_tmp = (struct btrfs_super_block *)
- (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1)));
+
+ super_tmp = page_address(page);
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- brelse(bh);
- return 0;
+ ret = 0;
+ goto out;
}
superblock_tmp =
@@ -799,9 +738,8 @@ static int btrfsic_process_superblock_dev_mirror(
if (NULL == superblock_tmp) {
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
- pr_info("btrfsic: error, kmalloc failed!\n");
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
/* for superblock, only the dev_bytenr makes sense */
superblock_tmp->dev_bytenr = dev_bytenr;
@@ -815,10 +753,10 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
- dev_state->name, dev_bytenr,
+ dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
@@ -885,8 +823,8 @@ static int btrfsic_process_superblock_dev_mirror(
mirror_num)) {
pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
next_bytenr, mirror_num);
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
next_block = btrfsic_block_lookup_or_add(
@@ -895,8 +833,8 @@ static int btrfsic_process_superblock_dev_mirror(
mirror_num, NULL);
if (NULL == next_block) {
btrfsic_release_block_ctx(&tmp_next_block_ctx);
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
next_block->disk_key = tmp_disk_key;
@@ -907,16 +845,17 @@ static int btrfsic_process_superblock_dev_mirror(
BTRFSIC_GENERATION_UNKNOWN);
btrfsic_release_block_ctx(&tmp_next_block_ctx);
if (NULL == l) {
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
}
}
if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
btrfsic_dump_tree_sub(state, superblock_tmp, 0);
- brelse(bh);
- return 0;
+out:
+ put_page(page);
+ return ret;
}
static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
@@ -924,9 +863,7 @@ static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
struct btrfsic_stack_frame *sf;
sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (NULL == sf)
- pr_info("btrfsic: alloc memory failed!\n");
- else
+ if (sf)
sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
return sf;
}
@@ -962,7 +899,7 @@ static noinline_for_stack int btrfsic_process_metablock(
sf->prev = NULL;
continue_with_new_stack_frame:
- sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ sf->block->generation = btrfs_stack_header_generation(sf->hdr);
if (0 == sf->hdr->level) {
struct btrfs_leaf *const leafhdr =
(struct btrfs_leaf *)sf->hdr;
@@ -998,9 +935,10 @@ continue_with_current_leaf_stack_frame:
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
leaf_item_out_of_bounce_error:
- pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(sf->block_ctx,
@@ -1118,9 +1056,10 @@ continue_with_current_node_stack_frame:
(uintptr_t)nodehdr;
if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
sf->block_ctx->len) {
- pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(
@@ -1288,15 +1227,17 @@ static int btrfsic_create_link_to_next_block(
if (next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr))
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block),
next_block->logical_bytenr);
else
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block));
@@ -1316,7 +1257,6 @@ static int btrfsic_create_link_to_next_block(
if (NULL == l) {
l = btrfsic_block_link_alloc();
if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(next_block_ctx);
*next_blockp = NULL;
return -1;
@@ -1385,8 +1325,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset +
offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
@@ -1405,8 +1345,8 @@ static int btrfsic_handle_extent_data(
if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
@@ -1473,7 +1413,6 @@ static int btrfsic_handle_extent_data(
mirror_num,
&block_was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&next_block_ctx);
return -1;
}
@@ -1483,9 +1422,10 @@ static int btrfsic_handle_extent_data(
next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
next_bytenr,
- next_block_ctx.dev->name,
+ next_block_ctx.dev->bdev,
next_block_ctx.dev_bytenr,
mirror_num,
next_block->logical_bytenr);
@@ -1517,7 +1457,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_bio *multi = NULL;
+ struct btrfs_io_context *multi = NULL;
struct btrfs_device *device;
length = len;
@@ -1568,12 +1508,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->pagev);
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
+ /* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
- if (block_ctx->datav[num_pages]) {
- kunmap(block_ctx->pagev[num_pages]);
+ if (block_ctx->datav[num_pages])
block_ctx->datav[num_pages] = NULL;
- }
if (block_ctx->pagev[num_pages]) {
__free_page(block_ctx->pagev[num_pages]);
block_ctx->pagev[num_pages] = NULL;
@@ -1613,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- for (i = 0; i < num_pages; i++) {
- block_ctx->pagev[i] = alloc_page(GFP_NOFS);
- if (!block_ctx->pagev[i])
- return -1;
- }
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
dev_bytenr = block_ctx->dev_bytenr;
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(num_pages - i);
- bio_set_dev(bio, block_ctx->dev->bdev);
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
- bio->bi_opf = REQ_OP_READ;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1640,8 +1576,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %s!\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
bio_put(bio);
return -1;
}
@@ -1650,7 +1586,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
return block_ctx->len;
}
@@ -1665,33 +1601,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
const struct btrfsic_block_link *l;
- pr_info("%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
}
@@ -1733,7 +1671,7 @@ static noinline_for_stack int btrfsic_test_for_metadata(
crypto_shash_update(shash, data, sublen);
}
crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, state->csum_size))
+ if (memcmp(csum, h->csum, fs_info->csum_size))
return 1;
return 0; /* is metadata */
@@ -1743,8 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- struct buffer_head *bh,
- int submit_bio_bh_rw)
+ blk_opf_t submit_bio_bh_rw)
{
int is_metadata;
struct btrfsic_block *block;
@@ -1807,16 +1744,18 @@ again:
if (block->logical_bytenr != bytenr &&
!(!block->is_metadata &&
block->logical_bytenr == 0))
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- bytenr, dev_state->name,
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state,
block),
block->logical_bytenr);
else
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
dev_bytenr, block->mirror_num,
btrfsic_get_block_type(state,
block));
@@ -1831,8 +1770,9 @@ again:
processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
}
@@ -1842,9 +1782,10 @@ again:
list_empty(&block->ref_to_list) ? ' ' : '!',
list_empty(&block->ref_from_list) ? ' ' : '!');
if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_disk_key_objectid(&block->disk_key),
block->disk_key.type,
@@ -1856,9 +1797,10 @@ again:
}
if (!block->is_iodone && !block->never_written) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_stack_header_generation(
(struct btrfs_header *)
@@ -1902,9 +1844,9 @@ again:
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
- block->orig_bio_bh_private =
+ block->orig_bio_private =
bio->bi_private;
- block->orig_bio_bh_end_io.bio =
+ block->orig_bio_end_io =
bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
@@ -1916,25 +1858,17 @@ again:
bio->bi_private;
BUG_ON(NULL == chained_block);
- block->orig_bio_bh_private =
- chained_block->orig_bio_bh_private;
- block->orig_bio_bh_end_io.bio =
- chained_block->orig_bio_bh_end_io.
- bio;
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
- } else if (NULL != bh) {
- block->is_iodone = 0;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
- block->orig_bio_bh_private = NULL;
- block->orig_bio_bh_end_io.bio = NULL;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
}
@@ -1993,8 +1927,9 @@ again:
if (!is_metadata) {
processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
- dev_state->name, dev_bytenr);
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
if (!state->include_extent_data) {
/* ignore that written D block */
goto continue_loop;
@@ -2011,8 +1946,9 @@ again:
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
}
block_ctx.dev = dev_state;
@@ -2025,7 +1961,6 @@ again:
block = btrfsic_block_alloc();
if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&block_ctx);
goto continue_loop;
}
@@ -2042,8 +1977,8 @@ again:
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
- block->orig_bio_bh_private = bio->bi_private;
- block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
@@ -2054,30 +1989,23 @@ again:
bio->bi_private;
BUG_ON(NULL == chained_block);
- block->orig_bio_bh_private =
- chained_block->orig_bio_bh_private;
- block->orig_bio_bh_end_io.bio =
- chained_block->orig_bio_bh_end_io.bio;
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
- } else if (NULL != bh) {
- block->is_iodone = 0;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
- block->orig_bio_bh_private = NULL;
- block->orig_bio_bh_end_io.bio = NULL;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2102,7 +2030,7 @@ continue_loop:
static void btrfsic_bio_end_io(struct bio *bp)
{
- struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ struct btrfsic_block *block = bp->bi_private;
int iodone_w_error;
/* mutex is not held! This is not save if IO is not yet completed
@@ -2112,8 +2040,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
iodone_w_error = 1;
BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_bh_private;
- bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+ bp->bi_private = block->orig_bio_private;
+ bp->bi_end_io = block->orig_bio_end_io;
do {
struct btrfsic_block *next_block;
@@ -2121,10 +2049,10 @@ static void btrfsic_bio_end_io(struct bio *bp)
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
@@ -2132,8 +2060,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %s flush_gen=%llu\n",
- dev_state->name,
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2146,38 +2074,6 @@ static void btrfsic_bio_end_io(struct bio *bp)
bp->bi_end_io(bp);
}
-static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
-{
- struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
- int iodone_w_error = !uptodate;
- struct btrfsic_dev_state *dev_state;
-
- BUG_ON(NULL == block);
- dev_state = block->dev_state;
- if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
- iodone_w_error,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, block->dev_state->name,
- block->dev_bytenr, block->mirror_num);
-
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bh_end_io() new %s flush_gen=%llu\n",
- dev_state->name, dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is on disk */
-
- bh->b_private = block->orig_bio_bh_private;
- bh->b_end_io = block->orig_bio_bh_end_io.bh;
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- bh->b_end_io(bh, uptodate);
-}
-
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const superblock,
@@ -2190,17 +2086,19 @@ static int btrfsic_process_written_superblock(
if (!(superblock->generation > state->max_superblock_generation ||
0 == state->max_superblock_generation)) {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
} else {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
@@ -2285,7 +2183,6 @@ static int btrfsic_process_written_superblock(
mirror_num,
&was_created);
if (NULL == next_block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
btrfsic_release_block_ctx(&tmp_next_block_ctx);
return -1;
}
@@ -2345,38 +2242,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
*/
list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
if (l->block_ref_to->never_written) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (!l->block_ref_to->is_iodone) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (l->block_ref_to->iodone_w_error) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
@@ -2386,10 +2287,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
l->parent_generation &&
BTRFSIC_GENERATION_UNKNOWN !=
l->block_ref_to->generation) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num,
l->block_ref_to->generation,
@@ -2397,10 +2299,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
ret = -1;
} else if (l->block_ref_to->flush_gen >
l->block_ref_to->dev_state->last_flush_gen) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num, block->flush_gen,
l->block_ref_to->dev_state->last_flush_gen);
@@ -2437,15 +2340,16 @@ static int btrfsic_is_block_ref_by_superblock(
*/
list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
if (l->block_ref_from->is_superblock &&
@@ -2467,30 +2371,30 @@ static int btrfsic_is_block_ref_by_superblock(
static void btrfsic_print_add_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
@@ -2532,9 +2436,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
printk("[...]\n");
@@ -2593,10 +2497,8 @@ static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
&state->block_link_hashtable);
if (NULL == l) {
l = btrfsic_block_link_alloc();
- if (NULL == l) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!l)
return NULL;
- }
l->block_ref_to = next_block;
l->block_ref_from = from_block;
@@ -2640,10 +2542,9 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
struct btrfsic_dev_state *dev_state;
block = btrfsic_block_alloc();
- if (NULL == block) {
- pr_info("btrfsic: error, kmalloc failed!\n");
+ if (!block)
return NULL;
- }
+
dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
if (NULL == dev_state) {
pr_info("btrfsic: error, lookup dev_state failed!\n");
@@ -2658,10 +2559,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
block->never_written = never_written;
block->mirror_num = mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
additional_string,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2708,8 +2609,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
}
if (WARN_ON(!match)) {
- pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr,
state->metablock_size,
@@ -2717,8 +2619,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
if (ret)
continue;
- pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
- bytenr, block_ctx.dev->name,
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
block_ctx.dev_bytenr, mirror_num);
}
}
@@ -2730,161 +2632,95 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
-int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- struct btrfsic_dev_state *dev_state;
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
+
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
+ return;
- if (!btrfsic_is_initialized)
- return submit_bh(op, op_flags, bh);
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bh() might also be called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev);
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
+ }
- /* Only called to write the superblock (incl. FLUSH/FUA) */
- if (NULL != dev_state &&
- (op == REQ_OP_WRITE) && bh->b_size > 0) {
- u64 dev_bytenr;
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
+}
- dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
- op, op_flags, (unsigned long long)bh->b_blocknr,
- dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
- btrfsic_process_written_block(dev_state, dev_bytenr,
- &bh->b_data, 1, NULL,
- NULL, bh, op_flags);
- } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n",
- op, op_flags, bh->b_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
+{
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = op_flags;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
- }
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
}
- mutex_unlock(&btrfsic_mutex);
- return submit_bh(op, op_flags, bh);
}
-static void __btrfsic_submit_bio(struct bio *bio)
+void btrfsic_check_bio(struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
if (!btrfsic_is_initialized)
return;
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bio() is also called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
- if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- unsigned int i = 0;
- u64 dev_bytenr;
- u64 cur_bytenr;
- struct bio_vec bvec;
- struct bvec_iter iter;
- int bio_is_patched;
- char **mapped_datav;
- unsigned int segs = bio_segments(bio);
-
- dev_bytenr = 512 * bio->bi_iter.bi_sector;
- bio_is_patched = 0;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- (unsigned long long)bio->bi_iter.bi_sector,
- dev_bytenr, bio->bi_disk);
-
- mapped_datav = kmalloc_array(segs,
- sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- goto leave;
- cur_bytenr = dev_bytenr;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, segs,
- bio, &bio_is_patched,
- NULL, bio->bi_opf);
- bio_for_each_segment(bvec, bio, iter)
- kunmap(bvec.bv_page);
- kfree(mapped_datav);
- } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_disk);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_bh_private = bio->bi_private;
- block->orig_bio_bh_end_io.bio = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- }
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
}
-leave:
mutex_unlock(&btrfsic_mutex);
}
-void btrfsic_submit_bio(struct bio *bio)
-{
- __btrfsic_submit_bio(bio);
- submit_bio(bio);
-}
-
-int btrfsic_submit_bio_wait(struct bio *bio)
-{
- __btrfsic_submit_bio(bio);
- return submit_bio_wait(bio);
-}
-
int btrfsic_mount(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices,
int including_extent_data, u32 print_mask)
@@ -2905,10 +2741,8 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
return -1;
}
state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state) {
- pr_info("btrfs check-integrity: allocation failed!\n");
+ if (!state)
return -ENOMEM;
- }
if (!btrfsic_is_initialized) {
mutex_init(&btrfsic_mutex);
@@ -2919,7 +2753,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
state->fs_info = fs_info;
state->print_mask = print_mask;
state->include_extent_data = including_extent_data;
- state->csum_size = 0;
state->metablock_size = fs_info->nodesize;
state->datablock_size = fs_info->sectorsize;
INIT_LIST_HEAD(&state->all_blocks_list);
@@ -2930,23 +2763,17 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- const char *p;
if (!device->bdev || !device->name)
continue;
ds = btrfsic_dev_state_alloc();
if (NULL == ds) {
- pr_info("btrfs check-integrity: kmalloc() failed!\n");
mutex_unlock(&btrfsic_mutex);
return -ENOMEM;
}
ds->bdev = device->bdev;
ds->state = state;
- bdevname(ds->bdev, ds->name);
- ds->name[BDEVNAME_SIZE - 1] = '\0';
- p = kbasename(ds->name);
- strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
}
@@ -3024,9 +2851,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
- pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 9bf4359cc44c..e4c8aed7996f 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,13 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
-void btrfsic_submit_bio(struct bio *bio);
-int btrfsic_submit_bio_wait(struct bio *bio);
+void btrfsic_check_bio(struct bio *bio);
#else
-#define btrfsic_submit_bh submit_bh
-#define btrfsic_submit_bio submit_bio
-#define btrfsic_submit_bio_wait submit_bio_wait
+static inline void btrfsic_check_bio(struct bio *bio) { }
#endif
int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9ab610cc9114..e6635fe70067 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -8,12 +8,15 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/highmem.h>
+#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
+#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
@@ -28,41 +31,8 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
-
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
-void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
-
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
-void lzo_free_workspace(struct list_head *ws);
-
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
+#include "subpage.h"
+#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -115,18 +85,23 @@ static int compression_compress_pages(int type, struct list_head *ws,
case BTRFS_COMPRESS_NONE:
default:
/*
- * This can't happen, the type is validated several times
- * before we get here. As a sane fallback, return what the
- * callers will understand as 'no compression happened'.
+ * This can happen when compression races with remount setting
+ * it to 'no compress', while caller doesn't call
+ * inode_need_compress() to check if we really need to
+ * compress.
+ *
+ * Not a big deal, just need to inform caller that we
+ * haven't allocated any pages yet.
*/
+ *out_pages = 0;
return -E2BIG;
}
}
-static int compression_decompress_bio(int type, struct list_head *ws,
- struct compressed_bio *cb)
+static int compression_decompress_bio(struct list_head *ws,
+ struct compressed_bio *cb)
{
- switch (type) {
+ switch (cb->compress_type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
@@ -163,146 +138,76 @@ static int compression_decompress(int type, struct list_head *ws,
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
- unsigned long disk_size)
-{
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
-}
-
-static int check_compressed_csum(struct btrfs_inode *inode,
- struct compressed_bio *cb,
- u64 disk_start)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int ret;
- struct page *page;
- unsigned long i;
- char *kaddr;
- u8 csum[BTRFS_CSUM_SIZE];
- u8 *cb_sum = cb->sums;
-
- if (inode->flags & BTRFS_INODE_NODATASUM)
- return 0;
-
- shash->tfm = fs_info->csum_shash;
-
- for (i = 0; i < cb->nr_pages; i++) {
- page = cb->compressed_pages[i];
-
- crypto_shash_init(shash);
- kaddr = kmap_atomic(page);
- crypto_shash_update(shash, kaddr, PAGE_SIZE);
- kunmap_atomic(kaddr);
- crypto_shash_final(shash, (u8 *)&csum);
-
- if (memcmp(&csum, cb_sum, csum_size)) {
- btrfs_print_data_csum_error(inode, disk_start,
- csum, cb_sum, cb->mirror_num);
- ret = -EIO;
- goto fail;
- }
- cb_sum += csum_size;
-
- }
- ret = 0;
-fail:
- return ret;
-}
-
-/* when we finish reading compressed pages from the disk, we
- * decompress them and then run the bio end_io routines on the
- * decompressed pages (in the inode address space).
- *
- * This allows the checksumming and other IO error handling routines
- * to work normally
- *
- * The compressed pages are freed here, and it must be run
- * in process context
- */
-static void end_compressed_bio_read(struct bio *bio)
+static void finish_compressed_bio_read(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
+ unsigned int index;
struct page *page;
- unsigned long index;
- unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
- int ret = 0;
-
- if (bio->bi_status)
- cb->errors = 1;
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
+ if (cb->status == BLK_STS_OK)
+ cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
- /*
- * Record the correct mirror_num in cb->orig_bio so that
- * read-repair can work properly.
- */
- ASSERT(btrfs_io_bio(cb->orig_bio));
- btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
- cb->mirror_num = mirror;
-
- /*
- * Some IO in this cb have failed, just skip checksum as there
- * is no way it could be correct.
- */
- if (cb->errors == 1)
- goto csum_failed;
-
- inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), cb,
- (u64)bio->bi_iter.bi_sector << 9);
- if (ret)
- goto csum_failed;
-
- /* ok, we're the last bio for this extent, lets start
- * the decompression.
- */
- ret = btrfs_decompress_bio(cb);
-
-csum_failed:
- if (ret)
- cb->errors = 1;
-
- /* release the compressed pages */
- index = 0;
+ /* Release the compressed pages */
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
put_page(page);
}
- /* do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ /* Do io completion on the original bio */
+ btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
- /*
- * we have verified the checksum already, set page
- * checked so the end_io handlers know about it
- */
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
- SetPageChecked(bvec->bv_page);
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
- bio_endio(cb->orig_bio);
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
+ */
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
+{
+ struct compressed_bio *cb = bbio->private;
+ struct inode *inode = cb->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ blk_status_t status = bbio->bio.bi_status;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (!status &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ btrfs_clean_io_failure(bi, start, bv.bv_page,
+ bv.bv_offset);
+ } else {
+ int ret;
+
+ refcount_inc(&cb->pending_ios);
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ btrfs_submit_data_read_bio);
+ if (ret) {
+ refcount_dec(&cb->pending_ios);
+ status = errno_to_blk_status(ret);
+ }
+ }
}
- /* finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
-out:
- bio_put(bio);
+ if (status)
+ cb->status = status;
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ btrfs_bio_free_csum(bbio);
+ bio_put(&bbio->bio);
}
/*
@@ -312,90 +217,151 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
- struct page *pages[16];
- unsigned long nr_pages = end_index - index + 1;
+ struct folio_batch fbatch;
+ const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
- if (cb->errors)
- mapping_set_error(inode->i_mapping, -EIO);
+ if (errno)
+ mapping_set_error(inode->i_mapping, errno);
+
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ ret = filemap_get_folios(inode->i_mapping, &index, end_index,
+ &fbatch);
+
+ if (ret == 0)
+ return;
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- nr_pages -= 1;
- index += 1;
- continue;
- }
for (i = 0; i < ret; i++) {
- if (cb->errors)
- SetPageError(pages[i]);
- end_page_writeback(pages[i]);
- put_page(pages[i]);
+ struct folio *folio = fbatch.folios[i];
+
+ if (errno)
+ folio_set_error(folio);
+ btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
+ cb->start, cb->len);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
}
/* the inode may be gone now */
}
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- struct page *page;
- unsigned long index;
-
- if (bio->bi_status)
- cb->errors = 1;
+ struct inode *inode = cb->inode;
+ unsigned int index;
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
-
- /* ok, we're the last bio for this extent, step one is to
- * call back into the FS and do all the end_io operations
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
*/
- inode = cb->inode;
- cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
- btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- bio->bi_status == BLK_STS_OK);
- cb->compressed_pages[0]->mapping = NULL;
+ cb->status == BLK_STS_OK);
- end_compressed_writeback(inode, cb);
- /* note, our inode could be gone now */
+ if (cb->writeback)
+ end_compressed_writeback(inode, cb);
+ /* Note, our inode could be gone now */
/*
- * release the compressed pages, these came from alloc_page and
+ * Release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
- index = 0;
for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
+ struct page *page = cb->compressed_pages[index];
+
page->mapping = NULL;
put_page(page);
}
- /* finally free the cb struct */
+ /* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
-out:
- bio_put(bio);
+}
+
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
+{
+ struct compressed_bio *cb = bbio->private;
+
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
+
+ if (refcount_dec_and_test(&cb->pending_ios)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
+ bio_put(&bbio->bio);
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) * data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_byten: The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Return value of logical bytenr of where next stripe starts.
+ * Let the caller know to only fill the bio up to the stripe
+ * boundary.
+ */
+
+
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ blk_opf_t opf,
+ btrfs_bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+ refcount_inc(&cb->pending_ios);
+ return bio;
}
/*
@@ -407,122 +373,122 @@ out:
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
- unsigned long len, u64 disk_start,
- unsigned long compressed_len,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages,
- unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css)
+ unsigned int nr_pages,
+ blk_opf_t write_flags,
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
- unsigned long bytes_left;
- int pg_index = 0;
- struct page *page;
- u64 first_byte = disk_start;
- blk_status_t ret;
- int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
- WARN_ON(!PAGE_ALIGNED(start));
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
+ blk_status_t ret = BLK_STS_OK;
+ int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
+ const bool use_append = btrfs_use_zone_append(inode, disk_start);
+ const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_bios, 0);
- cb->errors = 0;
- cb->inode = inode;
+ refcount_set(&cb->pending_ios, 1);
+ cb->status = BLK_STS_OK;
+ cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
- cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
- cb->orig_bio = NULL;
+ cb->writeback = writeback;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
-
- if (blkcg_css) {
- bio->bi_opf |= REQ_CGROUP_PUNT;
+ if (blkcg_css)
kthread_associate_blkcg(blkcg_css);
- }
- refcount_set(&cb->pending_bios, 1);
- /* create and submit bios for the compressed pages */
- bytes_left = compressed_len;
- for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
- int submit = 0;
-
- page = compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- if (bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
- 0);
-
- page->mapping = NULL;
- if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
-
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ break;
}
-
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
- bio_add_page(bio, page, PAGE_SIZE, 0);
- }
- if (bytes_left < PAGE_SIZE) {
- btrfs_info(fs_info,
- "bytes left %lu compress len %lu nr %lu",
- bytes_left, cb->compressed_len, cb->nr_pages);
}
- bytes_left -= PAGE_SIZE;
- first_byte += PAGE_SIZE;
- cond_resched();
- }
-
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ /*
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
+ */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Reached zoned boundary */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
+ if (!skip_sum) {
+ ret = btrfs_csum_one_bio(inode, bio, start, true);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ break;
+ }
+ }
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
+ bio = NULL;
+ }
+ cond_resched();
}
if (blkcg_css)
kthread_associate_blkcg(NULL);
- return 0;
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_write(cb);
+ return ret;
}
static u64 bio_end_offset(struct bio *bio)
@@ -532,45 +498,75 @@ static u64 bio_end_offset(struct bio *bio)
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for subpage read, we lock the
+ * full page then submit bio for each compressed/regular extents.
+ *
+ * This means, if we have several sectors in the same page points to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
- struct compressed_bio *cb)
+ struct compressed_bio *cb,
+ int *memstall, unsigned long *pflags)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- unsigned long pg_index;
- u64 last_offset;
+ u64 cur = bio_end_offset(cb->orig_bio);
u64 isize = i_size_read(inode);
int ret;
struct page *page;
- unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
- u64 end;
- int misses = 0;
+ int sectors_missed = 0;
- last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
if (isize == 0)
return 0;
+ /*
+ * For current subpage support, we only support 64K page size,
+ * which means maximum compressed extent size (128K) is just 2x page
+ * size.
+ * This makes readahead less effective, so here disable readahead for
+ * subpage for now, until full compressed write is supported.
+ */
+ if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_SHIFT;
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
- misses++;
- if (misses > 4)
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
break;
- goto next;
+
+ /*
+ * Jump to next page start as we already have page for
+ * current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -580,27 +576,39 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
- goto next;
+ /* There is already a page, skip to page end */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
- end = last_offset + PAGE_SIZE - 1;
- /*
- * at this point, we have a locked page in the page cache
- * for these bytes in the file. But, we have to make
- * sure they map to this compressed extent on disk.
- */
- set_page_extent_mapped(page);
- lock_extent(tree, last_offset, end);
+ if (!*memstall && PageWorkingset(page)) {
+ psi_memstall_enter(pflags);
+ *memstall = 1;
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
- if (!em || last_offset < em->start ||
- (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -608,33 +616,32 @@ static noinline int add_ra_bio_pages(struct inode *inode,
free_extent_map(em);
if (page->index == end_index) {
- char *userpage;
size_t zero_offset = offset_in_page(isize);
if (zero_offset) {
int zeros;
zeros = PAGE_SIZE - zero_offset;
- userpage = kmap_atomic(page);
- memset(userpage + zero_offset, 0, zeros);
- flush_dcache_page(page);
- kunmap_atomic(userpage);
+ memzero_page(page, zero_offset, zeros);
}
}
- ret = bio_add_page(cb->orig_bio, page,
- PAGE_SIZE, 0);
-
- if (ret == PAGE_SIZE) {
- nr_pages++;
- put_page(page);
- } else {
- unlock_extent(tree, last_offset, end);
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
}
-next:
- last_offset += PAGE_SIZE;
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio we will decrease
+ * subpage::readers and to unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
}
return 0;
}
@@ -650,171 +657,179 @@ next:
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
- unsigned long compressed_len;
- unsigned long nr_pages;
- unsigned long pg_index;
- struct page *page;
- struct bio *comp_bio;
- u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
+ unsigned int compressed_len;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
+ u64 file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
- blk_status_t ret = BLK_STS_RESOURCE;
- int faili = 0;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- u8 *sums;
+ unsigned long pflags;
+ int memstall = 0;
+ blk_status_t ret;
+ int ret2;
+ int i;
em_tree = &BTRFS_I(inode)->extent_tree;
+ file_offset = bio_first_bvec_all(bio)->bv_offset +
+ page_offset(bio_first_page_all(bio));
+
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree,
- page_offset(bio_first_page_all(bio)),
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
- if (!em)
- return BLK_STS_IOERR;
+ if (!em) {
+ ret = BLK_STS_IOERR;
+ goto out;
+ }
+ ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
- if (!cb)
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
+ if (!cb) {
+ ret = BLK_STS_RESOURCE;
goto out;
+ }
- refcount_set(&cb->pending_bios, 0);
- cb->errors = 0;
+ refcount_set(&cb->pending_ios, 1);
+ cb->status = BLK_STS_OK;
cb->inode = inode;
- cb->mirror_num = mirror_num;
- sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
em_start = em->start;
- free_extent_map(em);
- em = NULL;
-
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_compress_type(bio_flags);
+ cb->compress_type = em->compress_type;
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_NOFS);
- if (!cb->compressed_pages)
- goto fail1;
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
- __GFP_HIGHMEM);
- if (!cb->compressed_pages[pg_index]) {
- faili = pg_index - 1;
- ret = BLK_STS_RESOURCE;
- goto fail2;
- }
+ free_extent_map(em);
+ em = NULL;
+
+ cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!cb->compressed_pages) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
}
- faili = nr_pages - 1;
- cb->nr_pages = nr_pages;
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ if (ret2) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
+ }
+
+ add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
- refcount_set(&cb->pending_bios, 1);
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- int submit = 0;
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ cb->status = errno_to_blk_status(PTR_ERR(comp_bio));
+ break;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start start as we will
+ * submit comp_bio when reach the boundary immediately.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limit on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
- page = cb->compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_SHIFT;
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
+ /*
+ * Maximum compressed extent is smaller than bio size limit,
+ * thus bio_add_page() should always success.
+ */
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
- if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
- comp_bio, 0);
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
- page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
- unsigned int nr_sectors;
+ /* Has finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ if (submit) {
+ /* Save the original iter for read repair */
+ if (bio_op(comp_bio) == REQ_OP_READ)
+ btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
/*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
+ * Save the initial offset of this chunk, as there
+ * is no direct correlation between compressed pages and
+ * the original file offset. The field is only used for
+ * priting error messages.
*/
- refcount_inc(&cb->pending_bios);
-
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio,
- (u64)-1, sums);
- BUG_ON(ret); /* -ENOMEM */
- }
+ btrfs_bio(comp_bio)->file_offset = file_offset;
- nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
- fs_info->sectorsize);
- sums += csum_size * nr_sectors;
-
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
+ btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
+ break;
}
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
-
- bio_add_page(comp_bio, page, PAGE_SIZE, 0);
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, comp_bio, mirror_num);
+ comp_bio = NULL;
}
- cur_disk_byte += PAGE_SIZE;
}
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ if (memstall)
+ psi_memstall_leave(&pflags);
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
- BUG_ON(ret); /* -ENOMEM */
- }
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ return;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
- return 0;
-
-fail2:
- while (faili >= 0) {
- __free_page(cb->compressed_pages[faili]);
- faili--;
+fail:
+ if (cb->compressed_pages) {
+ for (i = 0; i < cb->nr_pages; i++) {
+ if (cb->compressed_pages[i])
+ __free_page(cb->compressed_pages[i]);
+ }
}
kfree(cb->compressed_pages);
-fail1:
kfree(cb);
out:
free_extent_map(em);
- return ret;
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
}
/*
@@ -1142,6 +1157,22 @@ static void put_workspace(int type, struct list_head *ws)
}
/*
+ * Adjust @level according to the limits of the compression algorithm or
+ * fallback to default
+ */
+static unsigned int btrfs_compress_set_level(int type, unsigned level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ if (level == 0)
+ level = ops->default_level;
+ else
+ level = min(level, ops->max_level);
+
+ return level;
+}
+
+/*
* Given an address space and start and length, compress the bytes into @pages
* that are allocated on demand.
*
@@ -1160,9 +1191,6 @@ static void put_workspace(int type, struct list_head *ws)
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
- *
- * @max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
*/
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
@@ -1183,20 +1211,6 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
return ret;
}
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * orig_bio contains the pages from the file that we want to decompress into
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous. They all correspond to the range of bytes covered by
- * the compressed extent.
- */
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
struct list_head *workspace;
@@ -1204,7 +1218,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = compression_decompress_bio(type, workspace, cb);
+ ret = compression_decompress_bio(workspace, cb);
put_workspace(type, workspace);
return ret;
@@ -1246,98 +1260,81 @@ void __cold btrfs_exit_compress(void)
}
/*
- * Copy uncompressed data from working buffer to pages.
+ * Copy decompressed data from working buffer to pages.
+ *
+ * @buf: The decompressed data buffer
+ * @buf_len: The decompressed data length
+ * @decompressed: Number of bytes that are already decompressed inside the
+ * compressed extent
+ * @cb: The compressed extent descriptor
+ * @orig_bio: The original bio that the caller wants to read for
+ *
+ * An easier to understand graph is like below:
+ *
+ * |<- orig_bio ->| |<- orig_bio->|
+ * |<------- full decompressed extent ----->|
+ * |<----------- @cb range ---->|
+ * | |<-- @buf_len -->|
+ * |<--- @decompressed --->|
*
- * buf_start is the byte offset we're of the start of our workspace buffer.
+ * Note that, @cb can be a subpage of the full decompressed extent, but
+ * @cb->start always has the same as the orig_file_offset value of the full
+ * decompressed extent.
*
- * total_out is the last byte of the buffer
+ * When reading compressed extent, we have to read the full compressed extent,
+ * while @orig_bio may only want part of the range.
+ * Thus this function will ensure only data covered by @orig_bio will be copied
+ * to.
+ *
+ * Return 0 if we have copied all needed contents for @orig_bio.
+ * Return >0 if we need continue decompress.
*/
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
- unsigned long total_out, u64 disk_start,
- struct bio *bio)
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed)
{
- unsigned long buf_offset;
- unsigned long current_buf_start;
- unsigned long start_byte;
- unsigned long prev_start_byte;
- unsigned long working_bytes = total_out - buf_start;
- unsigned long bytes;
- char *kaddr;
- struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
-
- /*
- * start byte is the first byte of the page we're currently
- * copying into relative to the start of the compressed data.
- */
- start_byte = page_offset(bvec.bv_page) - disk_start;
+ struct bio *orig_bio = cb->orig_bio;
+ /* Offset inside the full decompressed extent */
+ u32 cur_offset;
+
+ cur_offset = decompressed;
+ /* The main loop to do the copy */
+ while (cur_offset < decompressed + buf_len) {
+ struct bio_vec bvec;
+ size_t copy_len;
+ u32 copy_start;
+ /* Offset inside the full decompressed extent */
+ u32 bvec_offset;
+
+ bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
+ /*
+ * cb->start may underflow, but subtracting that value can still
+ * give us correct offset inside the full decompressed extent.
+ */
+ bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
- /* we haven't yet hit data corresponding to this page */
- if (total_out <= start_byte)
- return 1;
+ /* Haven't reached the bvec range, exit */
+ if (decompressed + buf_len <= bvec_offset)
+ return 1;
- /*
- * the start of the data we care about is offset into
- * the middle of our working buffer
- */
- if (total_out > start_byte && buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes -= buf_offset;
- } else {
- buf_offset = 0;
- }
- current_buf_start = buf_start;
-
- /* copy bytes from the working buffer into the pages */
- while (working_bytes > 0) {
- bytes = min_t(unsigned long, bvec.bv_len,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, working_bytes);
-
- kaddr = kmap_atomic(bvec.bv_page);
- memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes);
- kunmap_atomic(kaddr);
- flush_dcache_page(bvec.bv_page);
-
- buf_offset += bytes;
- working_bytes -= bytes;
- current_buf_start += bytes;
-
- /* check if we need to pick another page */
- bio_advance(bio, bytes);
- if (!bio->bi_iter.bi_size)
- return 0;
- bvec = bio_iter_iovec(bio, bio->bi_iter);
- prev_start_byte = start_byte;
- start_byte = page_offset(bvec.bv_page) - disk_start;
+ copy_start = max(cur_offset, bvec_offset);
+ copy_len = min(bvec_offset + bvec.bv_len,
+ decompressed + buf_len) - copy_start;
+ ASSERT(copy_len);
/*
- * We need to make sure we're only adjusting
- * our offset into compression working buffer when
- * we're switching pages. Otherwise we can incorrectly
- * keep copying when we were actually done.
+ * Extra range check to ensure we didn't go beyond
+ * @buf + @buf_len.
*/
- if (start_byte != prev_start_byte) {
- /*
- * make sure our new page is covered by this
- * working buffer
- */
- if (total_out <= start_byte)
- return 1;
-
- /*
- * the next page in the biovec might not be adjacent
- * to the last page, but it might still be found
- * inside this working buffer. bump our offset pointer
- */
- if (total_out > start_byte &&
- current_buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes = total_out - start_byte;
- current_buf_start = buf_start + buf_offset;
- }
- }
+ ASSERT(copy_start - decompressed < buf_len);
+ memcpy_to_page(bvec.bv_page, bvec.bv_offset,
+ buf + copy_start - decompressed, copy_len);
+ cur_offset += copy_len;
+
+ bio_advance(orig_bio, copy_len);
+ /* Finished the bio */
+ if (!orig_bio->bi_iter.bi_size)
+ return 0;
}
-
return 1;
}
@@ -1614,7 +1611,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
curr_sample_pos = 0;
while (index < index_end) {
page = find_get_page(inode->i_mapping, index);
- in_data = kmap(page);
+ in_data = kmap_local_page(page);
/* Handle case where the start is not aligned to PAGE_SIZE */
i = start % PAGE_SIZE;
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
@@ -1627,7 +1624,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
start += SAMPLING_INTERVAL;
curr_sample_pos += SAMPLING_READ_SIZE;
}
- kunmap(page);
+ kunmap_local(in_data);
put_page(page);
index++;
@@ -1748,19 +1745,3 @@ unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
return level;
}
-
-/*
- * Adjust @level according to the limits of the compression algorithm or
- * fallback to default
- */
-unsigned int btrfs_compress_set_level(int type, unsigned level)
-{
- const struct btrfs_compress_op *ops = btrfs_compress_op[type];
-
- if (level == 0)
- level = ops->default_level;
- else
- level = min(level, ops->max_level);
-
- return level;
-}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d253f7aa8ed5..1aa02903de69 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -8,6 +8,8 @@
#include <linux/sizes.h>
+struct btrfs_inode;
+
/*
* We want to make sure that amount of RAM required to uncompress an extent is
* reasonable, so we limit the total size in ram of a compressed extent to
@@ -20,14 +22,19 @@
/* Maximum length of compressed data stored on disk */
#define BTRFS_MAX_COMPRESSED (SZ_128K)
+static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
+
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
+ /* Number of outstanding bios */
+ refcount_t pending_ios;
+
+ /* Number of compressed pages in the array */
+ unsigned int nr_pages;
/* the pages with the compressed data on them */
struct page **compressed_pages;
@@ -38,30 +45,26 @@ struct compressed_bio {
/* starting offset in the inode for our pages */
u64 start;
- /* number of bytes in the inode we're working on */
- unsigned long len;
+ /* Number of bytes in the inode we're working on */
+ unsigned int len;
- /* number of bytes on disk */
- unsigned long compressed_len;
+ /* Number of bytes on disk */
+ unsigned int compressed_len;
- /* the compression algorithm for this bio */
- int compress_type;
+ /* The compression algorithm for this bio */
+ u8 compress_type;
- /* number of compressed pages in the array */
- unsigned long nr_pages;
+ /* Whether this is a write for writeback. */
+ bool writeback;
/* IO errors */
- int errors;
- int mirror_num;
+ blk_status_t status;
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
-
- /*
- * the start of a variable length array of checksums only
- * used by reads
- */
- u8 sums[];
+ union {
+ /* For reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -84,19 +87,19 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_out);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
- unsigned long total_out, u64 disk_start,
- struct bio *bio);
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed);
-blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
- unsigned long len, u64 disk_start,
- unsigned long compressed_len,
+blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages,
- unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css);
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+ unsigned int nr_pages,
+ blk_opf_t write_flags,
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback);
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
@@ -140,8 +143,41 @@ extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
-unsigned int btrfs_compress_set_level(int type, unsigned level);
-
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f2ec1a9bae28..a9543f01184c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -14,6 +15,8 @@
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
+#include "tree-mod-log.h"
+#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -31,8 +34,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
static const struct btrfs_csums {
u16 size;
- const char *name;
- const char *driver;
+ const char name[10];
+ const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
@@ -63,11 +66,12 @@ const char *btrfs_super_csum_name(u16 csum_type)
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
- return btrfs_csums[csum_type].driver ?:
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
-size_t __const btrfs_get_num_csums(void)
+size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
@@ -110,6 +114,22 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+bool __cold abort_should_print_stack(int errno)
+{
+ switch (errno) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
+
+/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -143,47 +163,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
return eb;
}
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* cowonly root (everything not a reference counted cow subvolume), just get
- * put onto a simple dirty list. transaction.c walks this to make sure they
- * get properly updated on disk.
+/*
+ * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
+ * just get put onto a simple dirty list. Transaction walks this list to make
+ * sure they get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
@@ -222,9 +205,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -234,7 +217,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -256,605 +240,18 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
-
- if (ret)
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
-enum mod_log_op {
- MOD_LOG_KEY_REPLACE,
- MOD_LOG_KEY_ADD,
- MOD_LOG_KEY_REMOVE,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING,
- MOD_LOG_MOVE_KEYS,
- MOD_LOG_ROOT_REPLACE,
-};
-
-struct tree_mod_root {
- u64 logical;
- u8 level;
-};
-
-struct tree_mod_elem {
- struct rb_node node;
- u64 logical;
- u64 seq;
- enum mod_log_op op;
-
- /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
- int slot;
-
- /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
- u64 generation;
-
- /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* this is used for op == MOD_LOG_MOVE_KEYS */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* this is used for op == MOD_LOG_ROOT_REPLACE */
- struct tree_mod_root old_root;
-};
-
-/*
- * Pull a new tree mod seq number for our operation.
- */
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
- return atomic64_inc_return(&fs_info->tree_mod_seq);
-}
-
-/*
- * This adds a new blocker to the tree mod log's blocker list if the @elem
- * passed does not already have a sequence number set. So when a caller expects
- * to record tree modifications, it should ensure to set elem->seq to zero
- * before calling btrfs_get_tree_mod_seq.
- * Returns a fresh, unused tree log modification sequence number, even if no new
- * blocker was added.
- */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- write_lock(&fs_info->tree_mod_log_lock);
- if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq(fs_info);
- list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-
- return elem->seq;
-}
-
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct rb_node *next;
- struct seq_list *cur_elem;
- struct tree_mod_elem *tm;
- u64 min_seq = (u64)-1;
- u64 seq_putting = elem->seq;
-
- if (!seq_putting)
- return;
-
- write_lock(&fs_info->tree_mod_log_lock);
- list_del(&elem->list);
- elem->seq = 0;
-
- list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if (cur_elem->seq < min_seq) {
- if (seq_putting > cur_elem->seq) {
- /*
- * blocker with lower sequence number exists, we
- * cannot remove anything from the log
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = cur_elem->seq;
- }
- }
-
- /*
- * anything that's lower than the lowest existing (read: blocked)
- * sequence number can be removed from the tree.
- */
- tm_root = &fs_info->tree_mod_log;
- for (node = rb_first(tm_root); node; node = next) {
- next = rb_next(node);
- tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq >= min_seq)
- continue;
- rb_erase(node, tm_root);
- kfree(tm);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
-/*
- * key order of the log:
- * node/leaf start address -> sequence
- *
- * The 'start address' is the logical address of the *new* root node
- * for root replace operations, or the logical address of the affected
- * block for all other operations.
- */
-static noinline int
-__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
-{
- struct rb_root *tm_root;
- struct rb_node **new;
- struct rb_node *parent = NULL;
- struct tree_mod_elem *cur;
-
- lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
-
- tm->seq = btrfs_inc_tree_mod_seq(fs_info);
-
- tm_root = &fs_info->tree_mod_log;
- new = &tm_root->rb_node;
- while (*new) {
- cur = rb_entry(*new, struct tree_mod_elem, node);
- parent = *new;
- if (cur->logical < tm->logical)
- new = &((*new)->rb_left);
- else if (cur->logical > tm->logical)
- new = &((*new)->rb_right);
- else if (cur->seq < tm->seq)
- new = &((*new)->rb_left);
- else if (cur->seq > tm->seq)
- new = &((*new)->rb_right);
- else
- return -EEXIST;
- }
-
- rb_link_node(&tm->node, parent, new);
- rb_insert_color(&tm->node, tm_root);
- return 0;
-}
-
-/*
- * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
- * returns zero with the tree_mod_log_lock acquired. The caller must hold
- * this until all tree mod log insertions are recorded in the rb tree and then
- * write unlock fs_info::tree_mod_log_lock.
- */
-static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb) {
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 1;
- if (eb && btrfs_header_level(eb) == 0)
- return 1;
-
- write_lock(&fs_info->tree_mod_log_lock);
- if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- write_unlock(&fs_info->tree_mod_log_lock);
- return 1;
- }
-
- return 0;
-}
-
-/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
-{
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 0;
- if (eb && btrfs_header_level(eb) == 0)
- return 0;
-
- return 1;
-}
-
-static struct tree_mod_elem *
-alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
-
- tm = kzalloc(sizeof(*tm), flags);
- if (!tm)
- return NULL;
-
- tm->logical = eb->start;
- if (op != MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
- tm->op = op;
- tm->slot = slot;
- tm->generation = btrfs_node_ptr_generation(eb, slot);
- RB_CLEAR_NODE(&tm->node);
-
- return tm;
-}
-
-static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
- int ret;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm = alloc_tree_mod_elem(eb, slot, op, flags);
- if (!tm)
- return -ENOMEM;
-
- if (tree_mod_dont_log(eb->fs_info, eb)) {
- kfree(tm);
- return 0;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- kfree(tm);
-
- return ret;
-}
-
-static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
- int dst_slot, int src_slot, int nr_items)
-{
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int ret = 0;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = eb->start;
- tm->slot = src_slot;
- tm->move.dst_slot = dst_slot;
- tm->move.nr_items = nr_items;
- tm->op = MOD_LOG_MOVE_KEYS;
-
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
- locked = 1;
-
- /*
- * When we override something during the move, we log these removals.
- * This can only happen when we move towards the beginning of the
- * buffer, i.e. dst_slot < src_slot.
- */
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
- if (ret)
- goto free_tms;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- if (ret)
- goto free_tms;
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-free_tms:
- for (i = 0; i < nr_items; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
- kfree(tm);
-
- return ret;
-}
-
-static inline int
-__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
-{
- int i, j;
- int ret;
-
- for (i = nritems - 1; i >= 0; i--) {
- ret = __tree_mod_log_insert(fs_info, tm_list[i]);
- if (ret) {
- for (j = nritems - 1; j > i; j--)
- rb_erase(&tm_list[j]->node,
- &fs_info->tree_mod_log);
- return ret;
- }
- }
-
- return 0;
-}
-
-static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
- struct extent_buffer *new_root, int log_removal)
-{
- struct btrfs_fs_info *fs_info = old_root->fs_info;
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int ret = 0;
- int i;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (log_removal && btrfs_header_level(old_root) > 0) {
- nritems = btrfs_header_nritems(old_root);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list) {
- ret = -ENOMEM;
- goto free_tms;
- }
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
- }
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = new_root->start;
- tm->old_root.logical = old_root->start;
- tm->old_root.level = btrfs_header_level(old_root);
- tm->generation = btrfs_header_generation(old_root);
- tm->op = MOD_LOG_ROOT_REPLACE;
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
-
- if (tm_list)
- ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- if (!ret)
- ret = __tree_mod_log_insert(fs_info, tm);
-
- write_unlock(&fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return ret;
-
-free_tms:
- if (tm_list) {
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
- }
- kfree(tm);
-
- return ret;
-}
-
-static struct tree_mod_elem *
-__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
- int smallest)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct tree_mod_elem *cur = NULL;
- struct tree_mod_elem *found = NULL;
-
- read_lock(&fs_info->tree_mod_log_lock);
- tm_root = &fs_info->tree_mod_log;
- node = tm_root->rb_node;
- while (node) {
- cur = rb_entry(node, struct tree_mod_elem, node);
- if (cur->logical < start) {
- node = node->rb_left;
- } else if (cur->logical > start) {
- node = node->rb_right;
- } else if (cur->seq < min_seq) {
- node = node->rb_left;
- } else if (!smallest) {
- /* we want the node with the highest seq */
- if (found)
- BUG_ON(found->seq > cur->seq);
- found = cur;
- node = node->rb_left;
- } else if (cur->seq > min_seq) {
- /* we want the node with the smallest seq */
- if (found)
- BUG_ON(found->seq < cur->seq);
- found = cur;
- node = node->rb_right;
- } else {
- found = cur;
- break;
- }
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
- return found;
-}
-
-/*
- * this returns the element from the log with the smallest time sequence
- * value that's in the log (the oldest log item). any element with a time
- * sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
- u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 1);
-}
-
-/*
- * this returns the element from the log with the largest time sequence
- * value that's in the log (the most recent log item). any element with
- * a time sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 0);
-}
-
-static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src, unsigned long dst_offset,
- unsigned long src_offset, int nr_items)
-{
- struct btrfs_fs_info *fs_info = dst->fs_info;
- int ret = 0;
- struct tree_mod_elem **tm_list = NULL;
- struct tree_mod_elem **tm_list_add, **tm_list_rem;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
- return 0;
-
- tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm_list_add = tm_list;
- tm_list_rem = tm_list + nr_items;
- for (i = 0; i < nr_items; i++) {
- tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
- if (!tm_list_rem[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
- MOD_LOG_KEY_ADD, GFP_NOFS);
- if (!tm_list_add[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
- locked = 1;
-
- for (i = 0; i < nr_items; i++) {
- ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
- if (ret)
- goto free_tms;
- ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
- if (ret)
- goto free_tms;
- }
-
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nr_items * 2; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return ret;
-}
-
-static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
-{
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int i;
- int ret = 0;
-
- if (btrfs_header_level(eb) == 0)
- return 0;
-
- if (!tree_mod_need_log(eb->fs_info, NULL))
- return 0;
-
- nritems = btrfs_header_nritems(eb);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
-
- ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
-
- return ret;
-}
-
/*
* check if the tree block can be shared by multiple trees
*/
@@ -862,12 +259,11 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf)
{
/*
- * Tree blocks not in reference counted trees and tree roots
- * are never shared. If a block was allocated after the last
- * snapshot and the block was not allocated by tree relocation,
- * we know the block is not shared.
+ * Tree blocks not in shareable trees and tree roots are never shared.
+ * If a block was allocated after the last snapshot and the block was
+ * not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
@@ -962,10 +358,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans,
- buf->start,
- buf->len,
- new_flags, level, 0);
+ ret = btrfs_set_disk_extent_flags(trans, buf,
+ new_flags, level);
if (ret)
return ret;
}
@@ -988,48 +382,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent_start,
- const struct btrfs_disk_key *disk_key,
- int level,
- u64 hint,
- u64 empty_size)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *ret;
-
- /*
- * If we are COWing a node/leaf from the extent, chunk, device or free
- * space trees, make sure that we do not finish block group creation of
- * pending block groups. We do this to avoid a deadlock.
- * COWing can result in allocation of a new chunk, and flushing pending
- * block groups (btrfs_create_pending_block_groups()) can be triggered
- * when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk, device and free space trees,
- * therefore we could deadlock with ourselves since we are holding a
- * lock on an extent buffer that btrfs_create_pending_block_groups() may
- * try to COW later.
- * For similar reasons, we also need to delay flushing pending block
- * groups when splitting a leaf or node, from one of those trees, since
- * we are holding a write lock on it and its parent or when inserting a
- * new root node for one of those trees.
- */
- if (root == fs_info->extent_root ||
- root == fs_info->chunk_root ||
- root == fs_info->dev_root ||
- root == fs_info->free_space_root)
- trans->can_flush_pending_bgs = false;
-
- ret = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, disk_key, level,
- hint, empty_size);
- trans->can_flush_pending_bgs = true;
-
- return ret;
-}
-
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -1047,7 +399,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1060,11 +413,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
- WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -1077,8 +430,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1099,13 +453,17 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1118,32 +476,34 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
atomic_inc(&cow->refs);
- ret = tree_mod_log_insert_root(root->node, cow, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(parent, parent_slot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ btrfs_tree_mod_log_insert_key(parent, parent_slot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(buf);
+ ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
}
- btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -1153,295 +513,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * returns the logical address of the oldest predecessor of the given root.
- * entries older than time_seq are ignored.
- */
-static struct tree_mod_elem *__tree_mod_log_oldest_root(
- struct extent_buffer *eb_root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- struct tree_mod_elem *found = NULL;
- u64 root_logical = eb_root->start;
- int looped = 0;
-
- if (!time_seq)
- return NULL;
-
- /*
- * the very last operation that's logged for a root is the
- * replacement operation (if it is replaced at all). this has
- * the logical address of the *new* root, making it the very
- * first operation that's logged for this root.
- */
- while (1) {
- tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
- time_seq);
- if (!looped && !tm)
- return NULL;
- /*
- * if there are no tree operation for the oldest root, we simply
- * return it. this should only happen if that (old) root is at
- * level 0.
- */
- if (!tm)
- break;
-
- /*
- * if there's an operation that's not a root replacement, we
- * found the oldest version of our root. normally, we'll find a
- * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
- */
- if (tm->op != MOD_LOG_ROOT_REPLACE)
- break;
-
- found = tm;
- root_logical = tm->old_root.logical;
- looped = 1;
- }
-
- /* if there's no old root to return, return what we found instead */
- if (!found)
- found = tm;
-
- return found;
-}
-
-/*
- * tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewound (until we reach something older than
- * time_seq).
- */
-static void
-__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
- u64 time_seq, struct tree_mod_elem *first_tm)
-{
- u32 n;
- struct rb_node *next;
- struct tree_mod_elem *tm = first_tm;
- unsigned long o_dst;
- unsigned long o_src;
- unsigned long p_size = sizeof(struct btrfs_key_ptr);
-
- n = btrfs_header_nritems(eb);
- read_lock(&fs_info->tree_mod_log_lock);
- while (tm && tm->seq >= time_seq) {
- /*
- * all the operations are recorded with the operator used for
- * the modification. as we're going backwards, we do the
- * opposite of each operation here.
- */
- switch (tm->op) {
- case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
- BUG_ON(tm->slot < n);
- /* Fallthrough */
- case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
- case MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- n++;
- break;
- case MOD_LOG_KEY_REPLACE:
- BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- break;
- case MOD_LOG_KEY_ADD:
- /* if a move operation is needed it's in the log */
- n--;
- break;
- case MOD_LOG_MOVE_KEYS:
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
- memmove_extent_buffer(eb, o_dst, o_src,
- tm->move.nr_items * p_size);
- break;
- case MOD_LOG_ROOT_REPLACE:
- /*
- * this operation is special. for roots, this must be
- * handled explicitly before rewinding.
- * for non-roots, this operation may exist if the node
- * was a root: root A -> child B; then A gets empty and
- * B is promoted to the new root. in the mod log, we'll
- * have a root-replace operation for B, a tree block
- * that is no root. we simply ignore that operation.
- */
- break;
- }
- next = rb_next(&tm->node);
- if (!next)
- break;
- tm = rb_entry(next, struct tree_mod_elem, node);
- if (tm->logical != first_tm->logical)
- break;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
- btrfs_set_header_nritems(eb, n);
-}
-
-/*
- * Called with eb read locked. If the buffer cannot be rewound, the same buffer
- * is returned. If rewind operations happen, a fresh buffer is returned. The
- * returned buffer is always read-locked. If the returned buffer is not the
- * input buffer, the lock on the input buffer is released and the input buffer
- * is freed (its refcount is decremented).
- */
-static struct extent_buffer *
-tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct extent_buffer *eb, u64 time_seq)
-{
- struct extent_buffer *eb_rewin;
- struct tree_mod_elem *tm;
-
- if (!time_seq)
- return eb;
-
- if (btrfs_header_level(eb) == 0)
- return eb;
-
- tm = tree_mod_log_search(fs_info, eb->start, time_seq);
- if (!tm)
- return eb;
-
- btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_read(eb);
-
- if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
- if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- btrfs_set_header_bytenr(eb_rewin, eb->start);
- btrfs_set_header_backref_rev(eb_rewin,
- btrfs_header_backref_rev(eb));
- btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
- btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
- } else {
- eb_rewin = btrfs_clone_extent_buffer(eb);
- if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- }
-
- btrfs_tree_read_unlock_blocking(eb);
- free_extent_buffer(eb);
-
- btrfs_tree_read_lock(eb_rewin);
- __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
- WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb_rewin;
-}
-
-/*
- * get_old_root() rewinds the state of @root's root node to the given @time_seq
- * value. If there are no changes, the current root->root_node is returned. If
- * anything changed in between, there's a fresh buffer allocated on which the
- * rewind operations are done. In any case, the returned buffer is read locked.
- * Returns NULL on error (with no locks held).
- */
-static inline struct extent_buffer *
-get_old_root(struct btrfs_root *root, u64 time_seq)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct tree_mod_elem *tm;
- struct extent_buffer *eb = NULL;
- struct extent_buffer *eb_root;
- u64 eb_root_owner = 0;
- struct extent_buffer *old;
- struct tree_mod_root *old_root = NULL;
- u64 old_generation = 0;
- u64 logical;
- int level;
-
- eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (!tm)
- return eb_root;
-
- if (tm->op == MOD_LOG_ROOT_REPLACE) {
- old_root = &tm->old_root;
- old_generation = tm->generation;
- logical = old_root->logical;
- level = old_root->level;
- } else {
- logical = eb_root->start;
- level = btrfs_header_level(eb_root);
- }
-
- tm = tree_mod_log_search(fs_info, logical, time_seq);
- if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, 0, level, NULL);
- if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
- if (!IS_ERR(old))
- free_extent_buffer(old);
- btrfs_warn(fs_info,
- "failed to read tree block %llu from get_old_root",
- logical);
- } else {
- eb = btrfs_clone_extent_buffer(old);
- free_extent_buffer(old);
- }
- } else if (old_root) {
- eb_root_owner = btrfs_header_owner(eb_root);
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(fs_info, logical);
- } else {
- btrfs_set_lock_blocking_read(eb_root);
- eb = btrfs_clone_extent_buffer(eb_root);
- btrfs_tree_read_unlock_blocking(eb_root);
- free_extent_buffer(eb_root);
- }
-
- if (!eb)
- return NULL;
- btrfs_tree_read_lock(eb);
- if (old_root) {
- btrfs_set_header_bytenr(eb, eb->start);
- btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, eb_root_owner);
- btrfs_set_header_level(eb, old_root->level);
- btrfs_set_header_generation(eb, old_generation);
- }
- if (tm)
- __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
- else
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb;
-}
-
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- int level;
- struct extent_buffer *eb_root = btrfs_root_node(root);
-
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
- level = tm->old_root.level;
- } else {
- level = btrfs_header_level(eb_root);
- }
- free_extent_buffer(eb_root);
-
- return level;
-}
-
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -1480,7 +551,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1500,17 +572,12 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
- trans->dirty = true;
*cow_ret = buf;
return 0;
}
search_start = buf->start & ~((u64)SZ_1G - 1);
- if (parent)
- btrfs_set_lock_blocking_write(parent);
- btrfs_set_lock_blocking_write(buf);
-
/*
* Before CoWing this block for later modification, check if it's
* the subtree root and do the delayed subtree trace if needed.
@@ -1519,12 +586,13 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
* helper function for defrag to decide if two blocks pointed to by a
@@ -1539,6 +607,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
return 0;
}
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static int comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
/*
* compare two keys in a memcmp fashion
*/
@@ -1551,6 +635,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
return btrfs_comp_cpu_keys(&k1, k2);
}
+#endif
/*
* same as comp_keys only with two btrfs_key's
@@ -1585,7 +670,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
u64 blocknr;
- u64 gen;
u64 search_start = *last_ret;
u64 last_block = 0;
u64 other;
@@ -1593,14 +677,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
int end_slot;
int i;
int err = 0;
- int parent_level;
- int uptodate;
u32 blocksize;
int progress_passed = 0;
struct btrfs_disk_key disk_key;
- parent_level = btrfs_header_level(parent);
-
WARN_ON(trans->transaction != fs_info->running_transaction);
WARN_ON(trans->transid != fs_info->generation);
@@ -1611,10 +691,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking_write(parent);
-
for (i = start_slot; i <= end_slot; i++) {
- struct btrfs_key first_key;
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1623,8 +700,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
- gen = btrfs_node_ptr_generation(parent, i);
- btrfs_node_key_to_cpu(parent, &first_key, i);
if (last_block == 0)
last_block = blocknr;
@@ -1641,40 +716,18 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = find_extent_buffer(fs_info, blocknr);
- if (cur)
- uptodate = btrfs_buffer_uptodate(cur, gen, 0);
- else
- uptodate = 0;
- if (!cur || !uptodate) {
- if (!cur) {
- cur = read_tree_block(fs_info, blocknr, gen,
- parent_level - 1,
- &first_key);
- if (IS_ERR(cur)) {
- return PTR_ERR(cur);
- } else if (!extent_buffer_uptodate(cur)) {
- free_extent_buffer(cur);
- return -EIO;
- }
- } else if (!uptodate) {
- err = btrfs_read_buffer(cur, gen,
- parent_level - 1,&first_key);
- if (err) {
- free_extent_buffer(cur);
- return err;
- }
- }
- }
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
if (search_start == 0)
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1690,31 +743,26 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
}
/*
- * search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart. There are 'max' items in p.
+ * Search for a key in the given extent_buffer.
*
- * the slot in the array is returned via slot, and it points to
- * the place where you would insert key if it is not found in
- * the array.
+ * The lower boundary for the search is specified by the slot number @low. Use a
+ * value of 0 to search over the whole extent buffer.
*
- * slot may point to max if the key is bigger than all of the keys
+ * The slot in the extent buffer is returned via @slot. If the key exists in the
+ * extent buffer, then @slot will point to the slot where the key is, otherwise
+ * it points to the slot where you would insert the key.
+ *
+ * Slot may point to the total number of items (i.e. one position beyond the last
+ * key) if the key is bigger than the last key in the extent buffer.
*/
-static noinline int generic_bin_search(struct extent_buffer *eb,
- unsigned long p, int item_size,
- const struct btrfs_key *key,
- int max, int *slot)
+static noinline int generic_bin_search(struct extent_buffer *eb, int low,
+ const struct btrfs_key *key, int *slot)
{
- int low = 0;
- int high = max;
- int mid;
+ unsigned long p;
+ int item_size;
+ int high = btrfs_header_nritems(eb);
int ret;
- struct btrfs_disk_key *tmp = NULL;
- struct btrfs_disk_key unaligned;
- unsigned long offset;
- char *kaddr = NULL;
- unsigned long map_start = 0;
- unsigned long map_len = 0;
- int err;
+ const int key_size = sizeof(struct btrfs_disk_key);
if (low > high) {
btrfs_err(eb->fs_info,
@@ -1724,33 +772,36 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
return -EINVAL;
}
+ if (btrfs_header_level(eb) == 0) {
+ p = offsetof(struct btrfs_leaf, items);
+ item_size = sizeof(struct btrfs_item);
+ } else {
+ p = offsetof(struct btrfs_node, ptrs);
+ item_size = sizeof(struct btrfs_key_ptr);
+ }
+
while (low < high) {
+ unsigned long oip;
+ unsigned long offset;
+ struct btrfs_disk_key *tmp;
+ struct btrfs_disk_key unaligned;
+ int mid;
+
mid = (low + high) / 2;
offset = p + mid * item_size;
+ oip = offset_in_page(offset);
- if (!kaddr || offset < map_start ||
- (offset + sizeof(struct btrfs_disk_key)) >
- map_start + map_len) {
-
- err = map_private_extent_buffer(eb, offset,
- sizeof(struct btrfs_disk_key),
- &kaddr, &map_start, &map_len);
-
- if (!err) {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
- } else if (err == 1) {
- read_extent_buffer(eb, &unaligned,
- offset, sizeof(unaligned));
- tmp = &unaligned;
- } else {
- return err;
- }
+ if (oip + key_size <= PAGE_SIZE) {
+ const unsigned long idx = get_eb_page_index(offset);
+ char *kaddr = page_address(eb->pages[idx]);
+ oip = get_eb_offset_in_page(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
- map_start);
+ read_extent_buffer(eb, &unaligned, offset, key_size);
+ tmp = &unaligned;
}
+
ret = comp_keys(tmp, key);
if (ret < 0)
@@ -1767,24 +818,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
}
/*
- * simple bin_search frontend that does the right thing for
- * leaves vs nodes
+ * Simple binary search on an extent buffer. Works for both leaves and nodes, and
+ * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot)
+ int *slot)
{
- if (level == 0)
- return generic_bin_search(eb,
- offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item),
- key, btrfs_header_nritems(eb),
- slot);
- else
- return generic_bin_search(eb,
- offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr),
- key, btrfs_header_nritems(eb),
- slot);
+ return generic_bin_search(eb, 0, key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
@@ -1820,11 +860,14 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
+ btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
- if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- eb = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
return eb;
@@ -1854,8 +897,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
- WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
- path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1884,15 +926,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
goto enospc;
}
- ret = tree_mod_log_insert_root(root->node, child, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, child);
@@ -1907,7 +949,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
@@ -1921,10 +963,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking_write(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1936,10 +978,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking_write(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1966,14 +1008,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), right,
+ 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2011,15 +1054,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2099,15 +1142,15 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking_write(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2120,8 +1163,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2153,8 +1196,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking_write(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2162,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2175,8 +1217,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2214,12 +1256,12 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
u64 search;
u64 target;
u64 nread = 0;
- struct extent_buffer *eb;
+ u64 nread_max;
u32 nr;
u32 blocksize;
u32 nscan = 0;
- if (level != 1)
+ if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
return;
if (!path->nodes[level])
@@ -2227,12 +1269,30 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
node = path->nodes[level];
+ /*
+ * Since the time between visiting leaves is much shorter than the time
+ * between visiting nodes, limit read ahead of nodes to 1, to avoid too
+ * much IO at once (possibly random).
+ */
+ if (path->reada == READA_FORWARD_ALWAYS) {
+ if (level > 1)
+ nread_max = node->fs_info->nodesize;
+ else
+ nread_max = SZ_128K;
+ } else {
+ nread_max = SZ_64K;
+ }
+
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
- eb = find_extent_buffer(fs_info, search);
- if (eb) {
- free_extent_buffer(eb);
- return;
+ if (path->reada != READA_FORWARD_ALWAYS) {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer(fs_info, search);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
}
target = search;
@@ -2245,7 +1305,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
if (nr == 0)
break;
nr--;
- } else if (path->reada == READA_FORWARD) {
+ } else if (path->reada == READA_FORWARD ||
+ path->reada == READA_FORWARD_ALWAYS) {
nr++;
if (nr >= nritems)
break;
@@ -2256,27 +1317,23 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
break;
}
search = btrfs_node_blockptr(node, nr);
- if ((search <= target && target - search <= 65536) ||
+ if (path->reada == READA_FORWARD_ALWAYS ||
+ (search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(fs_info, search);
+ btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
- if ((nread > 65536 || nscan > 32))
+ if (nread > nread_max || nscan > 32)
break;
}
}
-static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
+ struct extent_buffer *parent;
int slot;
int nritems;
- struct extent_buffer *parent;
- struct extent_buffer *eb;
- u64 gen;
- u64 block1 = 0;
- u64 block2 = 0;
parent = path->nodes[level + 1];
if (!parent)
@@ -2285,32 +1342,10 @@ static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- if (slot > 0) {
- block1 = btrfs_node_blockptr(parent, slot - 1);
- gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = find_extent_buffer(fs_info, block1);
- /*
- * if we get -eagain from btrfs_buffer_uptodate, we
- * don't want to return eagain here. That will loop
- * forever
- */
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block1 = 0;
- free_extent_buffer(eb);
- }
- if (slot + 1 < nritems) {
- block2 = btrfs_node_blockptr(parent, slot + 1);
- gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = find_extent_buffer(fs_info, block2);
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block2 = 0;
- free_extent_buffer(eb);
- }
-
- if (block1)
- readahead_tree_block(fs_info, block1);
- if (block2)
- readahead_tree_block(fs_info, block2);
+ if (slot > 0)
+ btrfs_readahead_node_child(parent, slot - 1);
+ if (slot + 1 < nritems)
+ btrfs_readahead_node_child(parent, slot + 1);
}
@@ -2333,33 +1368,34 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
{
int i;
int skip_level = level;
- int no_skips = 0;
- struct extent_buffer *t;
+ bool check_skip = true;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
if (!path->nodes[i])
break;
if (!path->locks[i])
break;
- if (!no_skips && path->slots[i] == 0) {
- skip_level = i + 1;
- continue;
- }
- if (!no_skips && path->keep_locks) {
- u32 nritems;
- t = path->nodes[i];
- nritems = btrfs_header_nritems(t);
- if (nritems < 1 || path->slots[i] >= nritems - 1) {
+
+ if (check_skip) {
+ if (path->slots[i] == 0) {
skip_level = i + 1;
continue;
}
+
+ if (path->keep_locks) {
+ u32 nritems;
+
+ nritems = btrfs_header_nritems(path->nodes[i]);
+ if (nritems < 1 || path->slots[i] >= nritems - 1) {
+ skip_level = i + 1;
+ continue;
+ }
+ }
}
- if (skip_level < i && i >= lowest_unlock)
- no_skips = 1;
- t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level) {
- btrfs_tree_unlock_rw(t, path->locks[i]);
+ check_skip = false;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
i > min_write_lock_level &&
@@ -2371,12 +1407,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * helper function for btrfs_search_slot. The goal is to find a block
- * in cache without setting the path to blocking. If we find the block
- * we return zero and the path is unchanged.
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
*
- * If we can't find the block, we set the path blocking and do some
- * reada. -EAGAIN is returned and the search must be repeated.
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@@ -2386,19 +1423,30 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 blocknr;
u64 gen;
- struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
struct btrfs_key first_key;
int ret;
int parent_level;
+ bool unlock_up;
- blocknr = btrfs_node_blockptr(b, slot);
- gen = btrfs_node_ptr_generation(b, slot);
- parent_level = btrfs_header_level(b);
- btrfs_node_key_to_cpu(b, &first_key, slot);
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
+ blocknr = btrfs_node_blockptr(*eb_ret, slot);
+ gen = btrfs_node_ptr_generation(*eb_ret, slot);
+ parent_level = btrfs_header_level(*eb_ret);
+ btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
+ if (p->reada == READA_FORWARD_ALWAYS)
+ reada_for_search(fs_info, p, level, slot, key->objectid);
+
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
/*
@@ -2415,56 +1463,68 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- btrfs_set_path_blocking(p);
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
- if (!ret) {
- *eb_ret = tmp;
- return 0;
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
}
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read. Don't release the lock on the current
- * level because we need to walk this node to figure
- * out which blocks to read.
- */
- btrfs_unlock_up_safe(p, level + 1);
- btrfs_set_path_blocking(p);
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
- &first_key);
- if (!IS_ERR(tmp)) {
- /*
- * If the read above didn't mark this buffer up to date,
- * it will never end up being up to date. Set ret to EIO now
- * and give up so that our caller doesn't loop forever
- * on our EAGAINs.
- */
- if (!extent_buffer_uptodate(tmp))
- ret = -EIO;
- free_extent_buffer(tmp);
+ tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
+ gen, parent_level - 1, &first_key);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
+ }
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
} else {
- ret = PTR_ERR(tmp);
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
}
- btrfs_release_path(p);
return ret;
}
@@ -2484,74 +1544,45 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
int *write_lock_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = split_node(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = split_node(trans, root, p, level);
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
} else if (ins_len < 0 && btrfs_header_nritems(b) <
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = balance_level(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = balance_level(trans, root, p, level);
+ if (ret)
+ return ret;
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
if (!b) {
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
BUG_ON(btrfs_header_nritems(b) == 1);
}
- return 0;
-
-again:
- ret = -EAGAIN;
-done:
return ret;
}
-static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
- int level, int *prev_cmp, int *slot)
-{
- if (*prev_cmp != 0) {
- *prev_cmp = btrfs_bin_search(b, key, level, slot);
- return *prev_cmp;
- }
-
- *slot = 0;
-
- return 0;
-}
-
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
@@ -2591,35 +1622,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
- /*
- * The commit roots are read only so we always do read locks,
- * and we always must hold the commit_root_sem when doing
- * searches on them, the only exception is send where we don't
- * want to block transaction commits for a long time, so
- * we need to clone the commit root in order to avoid races
- * with transaction commits that create a snapshot of one of
- * the roots used by a send operation.
- */
- if (p->need_commit_sem) {
- down_read(&fs_info->commit_root_sem);
- b = btrfs_clone_extent_buffer(root->commit_root);
- up_read(&fs_info->commit_root_sem);
- if (!b)
- return ERR_PTR(-ENOMEM);
-
- } else {
- b = root->commit_root;
- atomic_inc(&b->refs);
- }
+ b = root->commit_root;
+ atomic_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -2636,6 +1645,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2645,7 +1657,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2662,6 +1680,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -2671,6 +1700,191 @@ out:
return b;
}
+/*
+ * Replace the extent buffer at the lowest level of the path with a cloned
+ * version. The purpose is to be able to use it safely, after releasing the
+ * commit root semaphore, even if relocation is happening in parallel, the
+ * transaction used for relocation is committed and the extent buffer is
+ * reallocated in the next transaction.
+ *
+ * This is used in a context where the caller does not prevent transaction
+ * commits from happening, either by holding a transaction handle or holding
+ * some lock, while it's doing searches through a commit root.
+ * At the moment it's only used for send operations.
+ */
+static int finish_need_commit_sem_search(struct btrfs_path *path)
+{
+ const int i = path->lowest_level;
+ const int slot = path->slots[i];
+ struct extent_buffer *lowest = path->nodes[i];
+ struct extent_buffer *clone;
+
+ ASSERT(path->need_commit_sem);
+
+ if (!lowest)
+ return 0;
+
+ lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
+
+ clone = btrfs_clone_extent_buffer(lowest);
+ if (!clone)
+ return -ENOMEM;
+
+ btrfs_release_path(path);
+ path->nodes[i] = clone;
+ path->slots[i] = slot;
+
+ return 0;
+}
+
+static inline int search_for_key_slot(struct extent_buffer *eb,
+ int search_low_slot,
+ const struct btrfs_key *key,
+ int prev_cmp,
+ int *slot)
+{
+ /*
+ * If a previous call to btrfs_bin_search() on a parent node returned an
+ * exact match (prev_cmp == 0), we can safely assume the target key will
+ * always be at slot 0 on lower levels, since each key pointer
+ * (struct btrfs_key_ptr) refers to the lowest key accessible from the
+ * subtree it points to. Thus we can skip searching lower levels.
+ */
+ if (prev_cmp == 0) {
+ *slot = 0;
+ return 0;
+ }
+
+ return generic_bin_search(eb, search_low_slot, key, slot);
+}
+
+static int search_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const struct btrfs_key *key,
+ struct btrfs_path *path,
+ int ins_len,
+ int prev_cmp)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int leaf_free_space = -1;
+ int search_low_slot = 0;
+ int ret;
+ bool do_bin_search = true;
+
+ /*
+ * If we are doing an insertion, the leaf has enough free space and the
+ * destination slot for the key is not slot 0, then we can unlock our
+ * write lock on the parent, and any other upper nodes, before doing the
+ * binary search on the leaf (with search_for_key_slot()), allowing other
+ * tasks to lock the parent and any other upper nodes.
+ */
+ if (ins_len > 0) {
+ /*
+ * Cache the leaf free space, since we will need it later and it
+ * will not change until then.
+ */
+ leaf_free_space = btrfs_leaf_free_space(leaf);
+
+ /*
+ * !path->locks[1] means we have a single node tree, the leaf is
+ * the root of the tree.
+ */
+ if (path->locks[1] && leaf_free_space >= ins_len) {
+ struct btrfs_disk_key first_key;
+
+ ASSERT(btrfs_header_nritems(leaf) > 0);
+ btrfs_item_key(leaf, &first_key, 0);
+
+ /*
+ * Doing the extra comparison with the first key is cheap,
+ * taking into account that the first key is very likely
+ * already in a cache line because it immediately follows
+ * the extent buffer's header and we have recently accessed
+ * the header's level field.
+ */
+ ret = comp_keys(&first_key, key);
+ if (ret < 0) {
+ /*
+ * The first key is smaller than the key we want
+ * to insert, so we are safe to unlock all upper
+ * nodes and we have to do the binary search.
+ *
+ * We do use btrfs_unlock_up_safe() and not
+ * unlock_up() because the later does not unlock
+ * nodes with a slot of 0 - we can safely unlock
+ * any node even if its slot is 0 since in this
+ * case the key does not end up at slot 0 of the
+ * leaf and there's no need to split the leaf.
+ */
+ btrfs_unlock_up_safe(path, 1);
+ search_low_slot = 1;
+ } else {
+ /*
+ * The first key is >= then the key we want to
+ * insert, so we can skip the binary search as
+ * the target key will be at slot 0.
+ *
+ * We can not unlock upper nodes when the key is
+ * less than the first key, because we will need
+ * to update the key at slot 0 of the parent node
+ * and possibly of other upper nodes too.
+ * If the key matches the first key, then we can
+ * unlock all the upper nodes, using
+ * btrfs_unlock_up_safe() instead of unlock_up()
+ * as stated above.
+ */
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
+ /*
+ * ret is already 0 or 1, matching the result of
+ * a btrfs_bin_search() call, so there is no need
+ * to adjust it.
+ */
+ do_bin_search = false;
+ path->slots[0] = 0;
+ }
+ }
+ }
+
+ if (do_bin_search) {
+ ret = search_for_key_slot(leaf, search_low_slot, key,
+ prev_cmp, &path->slots[0]);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (ins_len > 0) {
+ /*
+ * Item key already exists. In this case, if we are allowed to
+ * insert the item (for example, in dir_item case, item key
+ * collision is allowed), it will be merged with the original
+ * item. Only the item size grows, no new btrfs item will be
+ * added. If search_for_extension is not set, ins_len already
+ * accounts the size btrfs_item, deduct it here so leaf space
+ * check will be correct.
+ */
+ if (ret == 0 && !path->search_for_extension) {
+ ASSERT(ins_len >= sizeof(struct btrfs_item));
+ ins_len -= sizeof(struct btrfs_item);
+ }
+
+ ASSERT(leaf_free_space >= 0);
+
+ if (leaf_free_space < ins_len) {
+ int err;
+
+ err = split_leaf(trans, root, key, path, ins_len,
+ (ret == 0));
+ ASSERT(err <= 0);
+ if (WARN_ON(err > 0))
+ err = -EUCLEAN;
+ if (err)
+ ret = err;
+ }
+ }
+
+ return ret;
+}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
@@ -2680,8 +1894,14 @@ out:
* @p: Holds all btree nodes along the search path
* @root: The root node of the tree
* @key: The key we are looking for
- * @ins_len: Indicates purpose of search, for inserts it is 1, for
- * deletions it's -1. 0 for plain searches
+ * @ins_len: Indicates purpose of search:
+ * >0 for inserts it's size of item inserted (*)
+ * <0 for deletions
+ * 0 for plain searches, not modifying the tree
+ *
+ * (*) If size of item inserted doesn't include
+ * sizeof(struct btrfs_item), then p->search_for_extension must
+ * be set.
* @cow: boolean should CoW operations be performed. Must always be 1
* when modifying the tree.
*
@@ -2701,6 +1921,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
@@ -2718,6 +1939,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
+ /*
+ * For now only allow nowait for read only operations. There's no
+ * strict reason why we can't, we just only need it for reads so it's
+ * only implemented for reads.
+ */
+ ASSERT(!p->nowait || !cow);
+
if (ins_len < 0) {
lowest_unlock = 2;
@@ -2742,6 +1970,16 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
min_write_lock_level = write_lock_level;
+ if (p->need_commit_sem) {
+ ASSERT(p->search_commit_root);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
+ }
+
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
@@ -2763,10 +2001,8 @@ again:
* then we don't want to set the path blocking,
* so we test it here
*/
- if (!should_cow_block(trans, root, b)) {
- trans->dirty = true;
+ if (!should_cow_block(trans, root, b))
goto cow_done;
- }
/*
* must have write locks on this node and the
@@ -2781,14 +2017,15 @@ again:
goto again;
}
- btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2796,10 +2033,6 @@ again:
}
cow_done:
p->nodes[level] = b;
- /*
- * Leave path with blocking locks to avoid massive
- * lock context switch, this is made on purpose.
- */
/*
* we have a lock on b and as long as we aren't changing
@@ -2821,35 +2054,22 @@ cow_done:
}
}
- ret = key_search(b, key, level, &prev_cmp, &slot);
- if (ret < 0)
- goto done;
-
if (level == 0) {
- p->slots[level] = slot;
- if (ins_len > 0 &&
- btrfs_leaf_free_space(b) < ins_len) {
- if (write_lock_level < 1) {
- write_lock_level = 1;
- btrfs_release_path(p);
- goto again;
- }
+ if (ins_len > 0)
+ ASSERT(write_lock_level >= 1);
- btrfs_set_path_blocking(p);
- err = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
-
- BUG_ON(err > 0);
- if (err) {
- ret = err;
- goto done;
- }
- }
+ ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL);
goto done;
}
+
+ ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
+ prev_cmp = ret;
+
if (ret && slot > 0) {
dec = 1;
slot--;
@@ -2896,15 +2116,20 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
+
+ btrfs_maybe_reset_lockdep_class(root, b);
+
if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
+ btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
btrfs_tree_read_lock(b);
}
p->locks[level] = BTRFS_READ_LOCK;
@@ -2914,16 +2139,21 @@ cow_done:
}
ret = 1;
done:
- /*
- * we don't really know what they plan on doing with the path
- * from here on, so for now just mark it as blocking
- */
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
+
+ if (p->need_commit_sem) {
+ int ret2;
+
+ ret2 = finish_need_commit_sem_search(p);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
/*
* Like btrfs_search_slot, this looks for a key in the given tree. It uses the
@@ -2947,10 +2177,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
- int prev_cmp = -1;
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
@@ -2958,7 +2188,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
}
again:
- b = get_old_root(root, time_seq);
+ b = btrfs_get_old_root(root, time_seq);
if (!b) {
ret = -EIO;
goto done;
@@ -2980,12 +2210,7 @@ again:
*/
btrfs_unlock_up_safe(p, level + 1);
- /*
- * Since we can unwind ebs we want to do a real search every
- * time.
- */
- prev_cmp = -1;
- ret = key_search(b, key, level, &prev_cmp, &slot);
+ ret = btrfs_bin_search(b, key, &slot);
if (ret < 0)
goto done;
@@ -3017,11 +2242,8 @@ again:
}
level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ btrfs_tree_read_lock(b);
+ b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -3031,8 +2253,6 @@ again:
}
ret = 1;
done:
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0)
btrfs_release_path(p);
@@ -3117,6 +2337,64 @@ again:
}
/*
+ * Execute search and call btrfs_previous_item to traverse backwards if the item
+ * was not found.
+ *
+ * Return 0 if found, 1 if not found and < 0 if error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+ if (ret == 0)
+ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+ return ret;
+}
+
+/**
+ * Search for a valid slot for the given path.
+ *
+ * @root: The root node of the tree.
+ * @key: Will contain a valid item if found.
+ * @path: The starting point to validate the slot.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /* This is where we start walking the path. */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset the path.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -3137,8 +2415,8 @@ static void fixup_low_keys(struct btrfs_path *path,
if (!path->nodes[i])
break;
t = path->nodes[i];
- ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
- GFP_ATOMIC);
+ ret = btrfs_tree_mod_log_insert_key(t, tslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@@ -3200,6 +2478,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3243,7 +2573,13 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
- ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3255,8 +2591,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * Don't call tree_mod_log_insert_move here, key removal was
- * already fully logged by tree_mod_log_eb_copy above.
+ * Don't call btrfs_tree_mod_log_insert_move() here, key removal
+ * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
@@ -3311,15 +2647,21 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
- ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
- push_items);
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+ push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3366,8 +2708,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3384,7 +2727,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- ret = tree_mod_log_insert_root(root->node, c, 0);
+ ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, c);
@@ -3394,7 +2737,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
atomic_inc(&c->refs);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
@@ -3416,15 +2759,15 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
if (slot != nritems) {
if (level) {
- ret = tree_mod_log_insert_move(lower, slot + 1, slot,
- nritems - slot);
+ ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
+ slot, nritems - slot);
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
@@ -3433,8 +2776,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(lower, slot,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -3475,9 +2818,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
* tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
- * elements below with tree_mod_log_eb_copy. We're holding a
- * tree lock on the buffer, which is why we cannot race with
- * other tree_mod_log users.
+ * elements below with btrfs_tree_mod_log_eb_copy(). We're
+ * holding a tree lock on the buffer, which is why we cannot
+ * race with other tree_mod_log users.
*/
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
@@ -3496,15 +2839,16 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0,
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
root_add_used(root, fs_info->nodesize);
ASSERT(btrfs_header_level(c) == level);
- ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
+ ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3515,7 +2859,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- ret = 0;
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
@@ -3533,7 +2876,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(split);
free_extent_buffer(split);
}
- return ret;
+ return 0;
}
/*
@@ -3543,21 +2886,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
- struct btrfs_item *start_item;
- struct btrfs_item *end_item;
- struct btrfs_map_token token;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
- btrfs_init_map_token(&token, l);
- start_item = btrfs_item_nr(start);
- end_item = btrfs_item_nr(end);
- data_len = btrfs_token_item_offset(l, start_item, &token) +
- btrfs_token_item_size(l, start_item, &token);
- data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
+ data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
+ data_len = data_len - btrfs_item_offset(l, end);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
@@ -3604,7 +2940,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
u32 i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 nr;
u32 right_nritems;
u32 data_end;
@@ -3621,8 +2956,6 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] > i)
break;
@@ -3637,12 +2970,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(left, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(left, i);
+ if (this_item_size + sizeof(struct btrfs_item) +
+ push_space > free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
if (i == 0)
break;
i--;
@@ -3656,7 +2990,7 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* push left to right */
right_nritems = btrfs_header_nritems(right);
- push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+ push_space = btrfs_item_data_end(left, left_nritems - push_items);
push_space -= leaf_data_end(left);
/* make room in the right data area */
@@ -3687,9 +3021,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
- push_space -= btrfs_token_item_size(right, item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space -= btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
left_nritems -= push_items;
@@ -3758,7 +3091,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
@@ -3768,27 +3101,27 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking_write(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
goto out_unlock;
- /* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(right);
- if (free_space < data_size)
- goto out_unlock;
-
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3829,7 +3162,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int i;
int push_space = 0;
int push_items = 0;
- struct btrfs_item *item;
u32 old_left_nritems;
u32 nr;
int ret = 0;
@@ -3843,8 +3175,6 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(i);
-
if (!empty && push_items > 0) {
if (path->slots[0] < i)
break;
@@ -3859,12 +3189,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (path->slots[0] == i)
push_space += data_size;
- this_item_size = btrfs_item_size(right, item);
- if (this_item_size + sizeof(*item) + push_space > free_space)
+ this_item_size = btrfs_item_size(right, i);
+ if (this_item_size + sizeof(struct btrfs_item) + push_space >
+ free_space)
break;
push_items++;
- push_space += this_item_size + sizeof(*item);
+ push_space += this_item_size + sizeof(struct btrfs_item);
}
if (push_items == 0) {
@@ -3880,27 +3211,24 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
push_items * sizeof(struct btrfs_item));
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
- btrfs_item_offset_nr(right, push_items - 1);
+ btrfs_item_offset(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
- btrfs_item_offset_nr(right, push_items - 1),
+ btrfs_item_offset(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
btrfs_init_map_token(&token, left);
- old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+ old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
-
- ioff = btrfs_token_item_offset(left, item, &token);
- btrfs_set_token_item_offset(left, item,
- ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
- &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -3910,7 +3238,7 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
right_nritems);
if (push_items < right_nritems) {
- push_space = btrfs_item_offset_nr(right, push_items - 1) -
+ push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
@@ -3928,11 +3256,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(i);
-
- push_space = push_space - btrfs_token_item_size(right,
- item, &token);
- btrfs_set_token_item_offset(right, item, push_space, &token);
+ push_space = push_space - btrfs_token_item_size(&token, i);
+ btrfs_set_token_item_offset(&token, i, push_space);
}
btrfs_mark_buffer_dirty(left);
@@ -3993,7 +3318,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
@@ -4003,8 +3328,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking_write(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -4012,9 +3336,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- /* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -4022,12 +3346,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(left);
- if (free_space < data_size) {
- ret = 1;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
goto out;
}
-
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4056,7 +3378,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
+ data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
@@ -4067,16 +3389,14 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(l), data_copy_size);
- rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
- ioff = btrfs_token_item_offset(right, item, &token);
- btrfs_set_token_item_offset(right, item,
- ioff + rt_data_off, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -4192,7 +3512,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ if (extend && data_size + btrfs_item_size(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
@@ -4277,8 +3597,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0,
+ num_doubles ? BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4351,7 +3681,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -4371,7 +3701,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
leaf = path->nodes[0];
/* if our item isn't there, return now */
- if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ if (item_size != btrfs_item_size(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
@@ -4385,7 +3715,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
goto err;
}
- btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
if (ret)
goto err;
@@ -4403,9 +3732,7 @@ static noinline int split_item(struct btrfs_path *path,
unsigned long split_offset)
{
struct extent_buffer *leaf;
- struct btrfs_item *item;
- struct btrfs_item *new_item;
- int slot;
+ int orig_slot, slot;
char *buf;
u32 nritems;
u32 item_size;
@@ -4415,11 +3742,9 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- btrfs_set_path_blocking(path);
-
- item = btrfs_item_nr(path->slots[0]);
- orig_offset = btrfs_item_offset(leaf, item);
- item_size = btrfs_item_size(leaf, item);
+ orig_slot = path->slots[0];
+ orig_offset = btrfs_item_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
@@ -4440,14 +3765,12 @@ static noinline int split_item(struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(slot);
+ btrfs_set_item_offset(leaf, slot, orig_offset);
+ btrfs_set_item_size(leaf, slot, item_size - split_offset);
- btrfs_set_item_offset(leaf, new_item, orig_offset);
- btrfs_set_item_size(leaf, new_item, item_size - split_offset);
-
- btrfs_set_item_offset(leaf, item,
- orig_offset + item_size - split_offset);
- btrfs_set_item_size(leaf, item, split_offset);
+ btrfs_set_item_offset(leaf, orig_slot,
+ orig_offset + item_size - split_offset);
+ btrfs_set_item_size(leaf, orig_slot, split_offset);
btrfs_set_header_nritems(leaf, nritems + 1);
@@ -4499,42 +3822,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
@@ -4544,7 +3831,6 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data_start;
@@ -4556,14 +3842,14 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
leaf = path->nodes[0];
slot = path->slots[0];
- old_size = btrfs_item_size_nr(leaf, slot);
+ old_size = btrfs_item_size(leaf, slot);
if (old_size == new_size)
return;
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
- old_data_start = btrfs_item_offset_nr(leaf, slot);
+ old_data_start = btrfs_item_offset(leaf, slot);
size_diff = old_size - new_size;
@@ -4577,11 +3863,9 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + size_diff, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + size_diff);
}
/* shift the data */
@@ -4624,8 +3908,7 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
fixup_low_keys(path, &disk_key, 1);
}
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, new_size);
+ btrfs_set_item_size(leaf, slot, new_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -4641,7 +3924,6 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
- struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data;
@@ -4659,7 +3941,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
BUG();
}
slot = path->slots[0];
- old_data = btrfs_item_end_nr(leaf, slot);
+ old_data = btrfs_item_data_end(leaf, slot);
BUG_ON(slot < 0);
if (slot >= nritems) {
@@ -4676,11 +3958,9 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - data_size, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff - data_size);
}
/* shift the data */
@@ -4689,9 +3969,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end, old_data - data_end);
data_end = old_data;
- old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(slot);
- btrfs_set_item_size(leaf, item, old_size + data_size);
+ old_size = btrfs_item_size(leaf, slot);
+ btrfs_set_item_size(leaf, slot, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -4700,17 +3979,19 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
@@ -4718,9 +3999,15 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4730,6 +4017,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
@@ -4740,11 +4028,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
- unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+ unsigned int old_data = btrfs_item_data_end(leaf, slot);
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4755,35 +4044,33 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff - total_data, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i,
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(leaf, item,
- data_end - data_size[i], &token);
- data_end -= data_size[i];
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
+ data_end -= batch->data_sizes[i];
+ btrfs_set_token_item_offset(&token, slot + i, data_end);
+ btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
@@ -4793,26 +4080,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
@@ -4821,8 +4125,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
@@ -4854,6 +4157,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
@@ -4869,8 +4206,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level) {
- ret = tree_mod_log_insert_move(parent, slot, slot + 1,
- nritems - slot - 1);
+ ret = btrfs_tree_mod_log_insert_move(parent, slot,
+ slot + 1, nritems - slot - 1);
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
@@ -4879,8 +4216,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, slot,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
BUG_ON(ret < 0);
}
@@ -4926,7 +4263,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
@@ -4938,25 +4275,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- struct btrfs_item *item;
- u32 last_off;
- u32 dsize = 0;
int ret = 0;
int wret;
- int i;
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
-
- for (i = 0; i < nr; i++)
- dsize += btrfs_item_size_nr(leaf, slot + i);
-
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(leaf);
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
@@ -4967,10 +4301,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(i);
- ioff = btrfs_token_item_offset(leaf, item, &token);
- btrfs_set_token_item_offset(leaf, item,
- ioff + dsize, &token);
+ ioff = btrfs_token_item_offset(&token, i);
+ btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -4986,7 +4318,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
- btrfs_set_path_blocking(path);
btrfs_clean_tree_block(leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
@@ -4999,25 +4330,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fixup_low_keys(path, &disk_key, 1);
}
- /* delete the leaf if it is mostly empty */
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
-
- btrfs_set_path_blocking(path);
- wret = push_leaf_left(trans, root, path, 1, 1,
- 1, (u32)-1);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1,
- 1, 1, 0);
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
@@ -5126,6 +4482,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
+ ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -5141,7 +4498,7 @@ again:
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = btrfs_bin_search(cur, min_key, level, &slot);
+ sret = btrfs_bin_search(cur, min_key, &slot);
if (sret < 0) {
ret = sret;
goto out;
@@ -5160,7 +4517,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5179,7 +4536,6 @@ find_next_key:
*/
if (slot >= nritems) {
path->slots[level] = slot;
- btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -5196,7 +4552,6 @@ find_next_key:
ret = 0;
goto out;
}
- btrfs_set_path_blocking(path);
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -5213,7 +4568,6 @@ out:
path->keep_locks = keep_locks;
if (ret == 0) {
btrfs_unlock_up_safe(path, path->lowest_level + 1);
- btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
}
return ret;
@@ -5295,16 +4649,6 @@ next:
return 1;
}
-/*
- * search the tree again to find a leaf with greater keys
- * returns 0 if it found something or 1 if there are no greater leaves.
- * returns < 0 on io errors.
- */
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
- return btrfs_next_old_leaf(root, path, 0);
-}
-
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
@@ -5312,11 +4656,14 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int level;
struct extent_buffer *c;
struct extent_buffer *next;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
+ bool need_commit_sem = false;
u32 nritems;
int ret;
- int old_spinning = path->leave_spinning;
- int next_rw_lock = 0;
+ int i;
+
+ ASSERT(!path->nowait);
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
@@ -5326,20 +4673,24 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
again:
level = 1;
next = NULL;
- next_rw_lock = 0;
btrfs_release_path(path);
path->keep_locks = 1;
- path->leave_spinning = 1;
- if (time_seq)
+ if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
- else
+ } else {
+ if (path->need_commit_sem) {
+ path->need_commit_sem = 0;
+ need_commit_sem = true;
+ down_read(&fs_info->commit_root_sem);
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ }
path->keep_locks = 0;
if (ret < 0)
- return ret;
+ goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
@@ -5390,13 +4741,22 @@ again:
continue;
}
- if (next) {
- btrfs_tree_unlock_rw(next, next_rw_lock);
- free_extent_buffer(next);
+
+ /*
+ * Our current level is where we're going to start from, and to
+ * make sure lockdep doesn't complain we need to drop our locks
+ * and nodes from 0 to our current level.
+ */
+ for (i = 0; i < level; i++) {
+ if (path->locks[level]) {
+ btrfs_tree_read_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
}
next = c;
- next_rw_lock = path->locks[level];
ret = read_block_for_search(root, path, &next, level,
slot, &key);
if (ret == -EAGAIN)
@@ -5422,26 +4782,18 @@ again:
cond_resched();
goto again;
}
- if (!ret) {
- btrfs_set_path_blocking(path);
+ if (!ret)
btrfs_tree_read_lock(next);
- }
- next_rw_lock = BTRFS_READ_LOCK;
}
break;
}
path->slots[level] = slot;
while (1) {
level--;
- c = path->nodes[level];
- if (path->locks[level])
- btrfs_tree_unlock_rw(c, path->locks[level]);
-
- free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
- path->locks[level] = next_rw_lock;
+ path->locks[level] = BTRFS_READ_LOCK;
if (!level)
break;
@@ -5455,21 +4807,21 @@ again:
goto done;
}
- if (!path->skip_locking) {
- ret = btrfs_try_tree_read_lock(next);
- if (!ret) {
- btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
- }
- next_rw_lock = BTRFS_READ_LOCK;
- }
+ if (!path->skip_locking)
+ btrfs_tree_read_lock(next);
}
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
- path->leave_spinning = old_spinning;
- if (!old_spinning)
- btrfs_set_path_blocking(path);
+ if (need_commit_sem) {
+ int ret2;
+
+ path->need_commit_sem = 1;
+ ret2 = finish_need_commit_sem_search(path);
+ up_read(&fs_info->commit_root_sem);
+ if (ret2)
+ ret = ret2;
+ }
return ret;
}
@@ -5491,7 +4843,6 @@ int btrfs_previous_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
@@ -5533,7 +4884,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36df977b64d9..9e6d48ff4597 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -17,7 +17,6 @@
#include <linux/wait.h>
#include <linux/slab.h>
#include <trace/events/btrfs.h>
-#include <asm/kmap_types.h>
#include <asm/unaligned.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
@@ -28,11 +27,13 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include <linux/iomap.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "locking.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -41,12 +42,18 @@ struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
+struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -66,12 +73,6 @@ struct btrfs_ref;
#define BTRFS_OLDEST_GENERATION 0ULL
/*
- * the max metadata block size. This limit is somewhat artificial,
- * but the memmove costs go through the roof for larger blocks.
- */
-#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
-
-/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
@@ -110,14 +111,6 @@ struct btrfs_ref;
#define BTRFS_STAT_CURR 0
#define BTRFS_STAT_PREV 1
-/*
- * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
- */
-static inline u32 count_max_extents(u64 size)
-{
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-}
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -136,6 +129,8 @@ enum {
* defrag
*/
BTRFS_FS_STATE_REMOUNTING,
+ /* Filesystem in RO mode */
+ BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
/*
@@ -145,6 +140,13 @@ enum {
BTRFS_FS_STATE_DEV_REPLACING,
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
+
+ BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
};
#define BTRFS_BACKREF_REV_MAX 256
@@ -220,6 +222,16 @@ struct btrfs_root_backup {
u8 unused_8[10];
} __attribute__ ((__packed__));
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
+/*
+ * The reserved space at the beginning of each device.
+ * It covers the primary super block and leaves space for potential use by other
+ * tools like bootloaders or to lower potential damage of accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
@@ -239,8 +251,12 @@ struct btrfs_super_block {
__le64 chunk_root;
__le64 log_root;
- /* this will help find the new super based on the log root */
- __le64 log_root_transid;
+ /*
+ * This member has never been utilized since the very beginning, thus
+ * it's always 0 regardless of kernel version. We always use
+ * generation + 1 to read log tree root. So here we mark it deprecated.
+ */
+ __le64 __unused_log_root_transid;
__le64 total_bytes;
__le64 bytes_used;
__le64 root_dir_objectid;
@@ -269,10 +285,15 @@ struct btrfs_super_block {
u8 metadata_uuid[BTRFS_FSID_SIZE];
/* future expansion */
- __le64 reserved[28];
+ u8 reserved8[8];
+ __le64 reserved[27];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* Compat flags that we support. If any incompat flags are set other than the
@@ -284,11 +305,33 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
- BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -301,7 +344,9 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34)
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -344,6 +389,27 @@ struct btrfs_node {
struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));
+/* Read ahead values for struct btrfs_path.reada */
+enum {
+ READA_NONE,
+ READA_BACK,
+ READA_FORWARD,
+ /*
+ * Similar to READA_FORWARD but unlike it:
+ *
+ * 1) It will trigger readahead even for leaves that are not close to
+ * each other on disk;
+ * 2) It also triggers readahead for nodes;
+ * 3) During a search, even when a node or leaf is already in memory, it
+ * will still trigger readahead for other nodes and leaves that follow
+ * it.
+ *
+ * This is meant to be used only when we know we are iterating over the
+ * entire tree or a very large part of it.
+ */
+ READA_FORWARD_ALWAYS,
+};
+
/*
* btrfs_paths remember the path taken from the root down to the leaf.
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
@@ -352,7 +418,6 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
-enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
@@ -369,13 +434,19 @@ struct btrfs_path {
unsigned int search_for_split:1;
unsigned int keep_locks:1;
unsigned int skip_locking:1;
- unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ /*
+ * Indicate that new item (btrfs_search_slot) is extending already
+ * existing item and ins_len contains only the data size and not item
+ * header (ie. sizeof(struct btrfs_item) is not included).
+ */
+ unsigned int search_for_extension:1;
+ /* Stop search if any locks need to be taken (for read) */
+ unsigned int nowait:1;
};
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
- sizeof(struct btrfs_item))
+
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
time64_t time_started; /* seconds since 1-Jan-1970 */
@@ -432,22 +503,6 @@ struct btrfs_free_cluster {
struct list_head block_group_list;
};
-enum btrfs_caching_type {
- BTRFS_CACHE_NO,
- BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FAST,
- BTRFS_CACHE_FINISHED,
- BTRFS_CACHE_ERROR,
-};
-
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
- struct rb_root root;
- struct mutex lock;
-};
-
/* Discard control. */
/*
* Async discard uses multiple lists to differentiate the discard filter
@@ -467,10 +522,11 @@ struct btrfs_discard_ctl {
struct btrfs_block_group *block_group;
struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
u64 prev_discard;
+ u64 prev_discard_time;
atomic_t discardable_extents;
atomic64_t discardable_bytes;
u64 max_discard_size;
- unsigned long delay;
+ u64 delay_ms;
u32 iops_limit;
u32 kbps_limit;
u64 discard_extent_bytes;
@@ -478,54 +534,7 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-/* delayed seq elem */
-struct seq_list {
- struct list_head list;
- u64 seq;
-};
-
-#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
-
-#define SEQ_LAST ((u64)-1)
-
-enum btrfs_orphan_cleanup_state {
- ORPHAN_CLEANUP_STARTED = 1,
- ORPHAN_CLEANUP_DONE = 2,
-};
-
-void btrfs_init_async_reclaim_work(struct work_struct *work);
-
-/* fs_info */
-struct reloc_control;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-
-/*
- * Block group or device which contains an active swapfile. Used for preventing
- * unsafe operations while a swapfile is active.
- *
- * These are sorted on (ptr, inode) (note that a block group or device can
- * contain more than one swapfile). We compare the pointer values because we
- * don't actually care what the object is, we just need a quick check whether
- * the object exists in the rbtree.
- */
-struct btrfs_swapfile_pin {
- struct rb_node node;
- void *ptr;
- struct inode *inode;
- /*
- * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
- * points to a struct btrfs_device.
- */
- bool is_block_group;
-};
-
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
enum {
- BTRFS_FS_BARRIER,
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
BTRFS_FS_LOG_RECOVERING,
@@ -540,22 +549,17 @@ enum {
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
- * To info transaction_kthread we need an immediate commit so it
- * doesn't need to wait for commit_interval
- */
- BTRFS_FS_NEED_ASYNC_COMMIT,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
- * Set and cleared while holding fs_info::balance_mutex.
*/
BTRFS_FS_BALANCE_RUNNING,
+ /*
+ * Indicate that relocation of a chunk has started, it's set per chunk
+ * and is toggled between chunks.
+ */
+ BTRFS_FS_RELOC_RUNNING,
+
/* Indicate that the cleaner thread is awake and doing something. */
BTRFS_FS_CLEANER_RUNNING,
@@ -567,37 +571,89 @@ enum {
/* Indicate that the discard workqueue can service discards. */
BTRFS_FS_DISCARD_RUNNING,
+
+ /* Indicate that we need to cleanup space cache v1 */
+ BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
+
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
+ /* Indicate that we want the transaction kthread to commit right now. */
+ BTRFS_FS_COMMIT_TRANS,
+
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
+#if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+ BTRFS_FS_32BIT_WARN,
+#endif
+};
+
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE_PAUSED,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
};
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
- struct btrfs_root *extent_root;
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
- struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
- struct btrfs_root *free_space_root;
+ struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
+ /* The tree that holds the global roots (csum, extent, etc) */
+ rwlock_t global_root_lock;
+ struct rb_root global_root_tree;
+
spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
- spinlock_t block_group_cache_lock;
- u64 first_logical_byte;
- struct rb_root block_group_cache_tree;
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
/* keep track of unallocated space */
atomic64_t free_chunk_space;
- struct extent_io_tree freed_extents[2];
- struct extent_io_tree *pinned_extents;
+ /* Track ranges which are used by log trees blocks/logged data extents */
+ struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
struct extent_map_tree mapping_tree;
@@ -620,6 +676,12 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
+ /*
+ * Generation of the last transaction used for block group relocation
+ * since the filesystem was last mounted (or 0 if none happened yet).
+ * Must be written and read while holding btrfs_fs_info::commit_root_sem.
+ */
+ u64 last_reloc_trans;
u64 avg_delayed_ref_runtime;
/*
@@ -696,7 +758,6 @@ struct btrfs_fs_info {
struct rw_semaphore cleanup_work_sem;
struct rw_semaphore subvol_sem;
- struct srcu_struct subvol_srcu;
spinlock_t trans_lock;
/*
@@ -753,18 +814,17 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
- struct btrfs_workqueue *endio_workers;
- struct btrfs_workqueue *endio_meta_workers;
- struct btrfs_workqueue *endio_raid56_workers;
- struct btrfs_workqueue *endio_repair_workers;
- struct btrfs_workqueue *rmw_workers;
- struct btrfs_workqueue *endio_meta_write_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *endio_raid56_workers;
+ struct workqueue_struct *rmw_workers;
+ struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
- struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
@@ -779,13 +839,13 @@ struct btrfs_fs_info {
u32 thread_pool_size;
struct kobject *space_info_kobj;
-
- u64 total_pinned;
+ struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
- struct percpu_counter dio_bytes;
+ struct percpu_counter ordered_bytes;
s32 dirty_metadata_batch;
s32 delalloc_batch;
@@ -834,6 +894,9 @@ struct btrfs_fs_info {
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
+ /* Cancellation requests for chunk relocation */
+ atomic_t reloc_cancel_req;
+
u32 data_chunk_allocations;
u32 metadata_ratio;
@@ -851,9 +914,10 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- struct btrfs_workqueue *scrub_workers;
- struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_parity_workers;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -873,7 +937,10 @@ struct btrfs_fs_info {
*/
struct ulist *qgroup_ulist;
- /* protect user change for quota operations */
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
struct mutex qgroup_ioctl_lock;
/* list of dirty qgroups to be written at next commit */
@@ -889,21 +956,16 @@ struct btrfs_fs_info {
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
+ u8 qgroup_drop_subtree_thres;
/* filesystem state */
unsigned long fs_state;
struct btrfs_delayed_root *delayed_root;
- /* readahead tree */
- spinlock_t reada_lock;
- struct radix_tree_root reada_tree;
-
- /* readahead works cnt */
- atomic_t reada_works_cnt;
-
/* Extent buffer radix tree */
spinlock_t buffer_lock;
+ /* Entries are eb->start / sectorsize */
struct radix_tree_root buffer_radix;
/* next backup root to be overwritten */
@@ -916,28 +978,88 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
+
+ /* Reclaim partially filled block groups in the background */
+ struct work_struct reclaim_bgs_work;
+ struct list_head reclaim_bgs;
+ int bg_reclaim_threshold;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
- struct mutex delete_unused_bgs_mutex;
+ /* Protect block groups that are going to be deleted */
+ struct mutex reclaim_bgs_lock;
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
+ /* ilog2 of sectorsize, use to avoid 64bit division */
+ u32 sectorsize_bits;
+ u32 csum_size;
+ u32 csums_per_leaf;
u32 stripesize;
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
+ * filesystem, on zoned it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
/* Block groups and devices containing active swapfiles. */
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
struct crypto_shash *csum_shash;
+ /* Type of exclusive operation running, protected by super_lock */
+ enum btrfs_exclusive_operation exclusive_operation;
+
/*
- * Number of send operations in progress.
- * Updated while holding fs_info::balance_mutex.
+ * Zone size > 0 when in ZONED mode, otherwise it's used for a check
+ * if the mode is enabled
*/
- int send_in_progress;
+ u64 zone_size;
+
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
+
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
+
+ u64 nr_global_roots;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
@@ -946,19 +1068,90 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
- struct kobject *discard_debug_kobj;
+ struct list_head allocated_roots;
+
+ spinlock_t eb_leak_lock;
+ struct list_head allocated_ebs;
#endif
};
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
-struct btrfs_subvolume_writers {
- struct percpu_counter counter;
- wait_queue_head_t wait;
-};
+/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many fs_info->max_extent_size cover the @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
/*
* The state of btrfs root
@@ -971,7 +1164,28 @@ enum {
* is used to tell us when more checks are required
*/
BTRFS_ROOT_IN_TRANS_SETUP,
- BTRFS_ROOT_REF_COWS,
+
+ /*
+ * Set if tree blocks of this root can be shared by other roots.
+ * Only subvolume trees and their reloc trees have this bit set.
+ * Conflicts with TRACK_DIRTY bit.
+ *
+ * This affects two things:
+ *
+ * - How balance works
+ * For shareable roots, we need to use reloc tree and do path
+ * replacement for balance, and need various pre/post hooks for
+ * snapshot creation to handle them.
+ *
+ * While for non-shareable trees, we just simply do a tree search
+ * with COW.
+ *
+ * - How dirty roots are tracked
+ * For shareable roots, btrfs_record_root_in_trans() is needed to
+ * track them, while non-subvolume roots have TRACK_DIRTY bit, they
+ * don't need to set this manually.
+ */
+ BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
BTRFS_ROOT_IN_RADIX,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
@@ -989,9 +1203,100 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
+ /* The root has a log tree. Used for subvolume roots and the tree root. */
+ BTRFS_ROOT_HAS_LOG_TREE,
+ /* Qgroup flushing is in progress */
+ BTRFS_ROOT_QGROUP_FLUSHING,
+ /* We started the orphan cleanup for this root. */
+ BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
+ /* This reloc root needs to have its buffers lockdep class reset. */
+ BTRFS_ROOT_RESET_LOCKDEP_CLASS,
+};
+
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
};
/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
+/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
*/
@@ -1007,6 +1312,8 @@ struct btrfs_qgroup_swapped_blocks {
* and for the extent tree extent_root root.
*/
struct btrfs_root {
+ struct rb_node rb_node;
+
struct extent_buffer *node;
struct extent_buffer *commit_root;
@@ -1024,21 +1331,14 @@ struct btrfs_root {
spinlock_t accounting_lock;
struct btrfs_block_rsv *block_rsv;
- /* free ino cache stuff */
- struct btrfs_free_space_ctl *free_ino_ctl;
- enum btrfs_caching_type ino_cache_state;
- spinlock_t ino_cache_lock;
- wait_queue_head_t ino_cache_wait;
- struct btrfs_free_space_ctl *free_ino_pinned;
- u64 ino_cache_progress;
- struct inode *ino_cache_inode;
-
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
struct list_head log_ctxs[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_writers;
atomic_t log_commit[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
int log_transid;
/* No matter the commit succeeds or not*/
@@ -1051,13 +1351,12 @@ struct btrfs_root {
u32 type;
- u64 highest_objectid;
+ u64 free_objectid;
- u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
- /* the dirty list is only used by non-reference counted roots */
+ /* The dirty list is only used by non-shareable roots */
struct list_head dirty_list;
struct list_head root_list;
@@ -1065,8 +1364,6 @@ struct btrfs_root {
spinlock_t log_extents_lock[2];
struct list_head logged_list[2];
- int orphan_cleanup_state;
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
@@ -1131,14 +1428,16 @@ struct btrfs_root {
* root_item_lock.
*/
int dedupe_in_progress;
- struct btrfs_subvolume_writers *subv_writers;
- atomic_t will_be_snapshotted;
+ /* For exclusion of snapshot creation and nocow writes */
+ struct btrfs_drew_lock snapshot_lock;
+
atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */
atomic_t nr_swapfiles;
@@ -1146,29 +1445,111 @@ struct btrfs_root {
/* Record pairs of swapped blocks for qgroup */
struct btrfs_qgroup_swapped_blocks swapped_blocks;
+ /* Used only by log trees, when logging csum items */
+ struct extent_io_tree log_csum_range;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
+};
+
+/* Arguments for btrfs_drop_extents() */
+struct btrfs_drop_extents_args {
+ /* Input parameters */
+
+ /*
+ * If NULL, btrfs_drop_extents() will allocate and free its own path.
+ * If 'replace_extent' is true, this must not be NULL. Also the path
+ * is always released except if 'replace_extent' is true and
+ * btrfs_drop_extents() sets 'extent_inserted' to true, in which case
+ * the path is kept locked.
+ */
+ struct btrfs_path *path;
+ /* Start offset of the range to drop extents from */
+ u64 start;
+ /* End (exclusive, last byte + 1) of the range to drop extents from */
+ u64 end;
+ /* If true drop all the extent maps in the range */
+ bool drop_cache;
+ /*
+ * If true it means we want to insert a new extent after dropping all
+ * the extents in the range. If this is true, the 'extent_item_size'
+ * parameter must be set as well and the 'extent_inserted' field will
+ * be set to true by btrfs_drop_extents() if it could insert the new
+ * extent.
+ * Note: when this is set to true the path must not be NULL.
+ */
+ bool replace_extent;
+ /*
+ * Used if 'replace_extent' is true. Size of the file extent item to
+ * insert after dropping all existing extents in the range
+ */
+ u32 extent_item_size;
+
+ /* Output parameters */
+
+ /*
+ * Set to the minimum between the input parameter 'end' and the end
+ * (exclusive, last byte + 1) of the last dropped extent. This is always
+ * set even if btrfs_drop_extents() returns an error.
+ */
+ u64 drop_end;
+ /*
+ * The number of allocated bytes found in the range. This can be smaller
+ * than the range's length when there are holes in the range.
+ */
+ u64 bytes_found;
+ /*
+ * Only set if 'replace_extent' is true. Set to true if we were able
+ * to insert a replacement extent after dropping all extents in the
+ * range, otherwise set to false by btrfs_drop_extents().
+ * Also, if btrfs_drop_extents() has set this to true it means it
+ * returned with the path locked, otherwise if it has set this to
+ * false it has returned with the path released.
+ */
+ bool extent_inserted;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1206,36 +1587,39 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
*
* Note: don't forget to add new options to btrfs_show_options()
*/
-#define BTRFS_MOUNT_NODATASUM (1 << 0)
-#define BTRFS_MOUNT_NODATACOW (1 << 1)
-#define BTRFS_MOUNT_NOBARRIER (1 << 2)
-#define BTRFS_MOUNT_SSD (1 << 3)
-#define BTRFS_MOUNT_DEGRADED (1 << 4)
-#define BTRFS_MOUNT_COMPRESS (1 << 5)
-#define BTRFS_MOUNT_NOTREELOG (1 << 6)
-#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
-#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
-#define BTRFS_MOUNT_NOSSD (1 << 9)
-#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
-#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
-#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
-#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
-#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
-#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
-#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
-#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
-#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
-#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
-#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
-#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
-#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
-#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
-#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
-#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
-#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
-#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
-#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
-#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
+enum {
+ BTRFS_MOUNT_NODATASUM = (1UL << 0),
+ BTRFS_MOUNT_NODATACOW = (1UL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1UL << 2),
+ BTRFS_MOUNT_SSD = (1UL << 3),
+ BTRFS_MOUNT_DEGRADED = (1UL << 4),
+ BTRFS_MOUNT_COMPRESS = (1UL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1UL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
+ BTRFS_MOUNT_NOSSD = (1UL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
+ BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
+ BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
+};
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -1247,18 +1631,18 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
BTRFS_MOUNT_##opt)
#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (!btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_set_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-{ \
+do { \
if (btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_clear_opt(fs_info->mount_opt, opt); \
-}
+} while (0)
/*
* Requests for changes that need to be done during transaction commit.
@@ -1268,9 +1652,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
* transaction commit)
*/
-#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
-#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
-#define BTRFS_PENDING_COMMIT (2)
+#define BTRFS_PENDING_COMMIT (0)
#define btrfs_test_pending(info, opt) \
test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
@@ -1306,20 +1688,20 @@ do { \
/*
* Inode flags
*/
-#define BTRFS_INODE_NODATASUM (1 << 0)
-#define BTRFS_INODE_NODATACOW (1 << 1)
-#define BTRFS_INODE_READONLY (1 << 2)
-#define BTRFS_INODE_NOCOMPRESS (1 << 3)
-#define BTRFS_INODE_PREALLOC (1 << 4)
-#define BTRFS_INODE_SYNC (1 << 5)
-#define BTRFS_INODE_IMMUTABLE (1 << 6)
-#define BTRFS_INODE_APPEND (1 << 7)
-#define BTRFS_INODE_NODUMP (1 << 8)
-#define BTRFS_INODE_NOATIME (1 << 9)
-#define BTRFS_INODE_DIRSYNC (1 << 10)
-#define BTRFS_INODE_COMPRESS (1 << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+#define BTRFS_INODE_NODATASUM (1U << 0)
+#define BTRFS_INODE_NODATACOW (1U << 1)
+#define BTRFS_INODE_READONLY (1U << 2)
+#define BTRFS_INODE_NOCOMPRESS (1U << 3)
+#define BTRFS_INODE_PREALLOC (1U << 4)
+#define BTRFS_INODE_SYNC (1U << 5)
+#define BTRFS_INODE_IMMUTABLE (1U << 6)
+#define BTRFS_INODE_APPEND (1U << 7)
+#define BTRFS_INODE_NODUMP (1U << 8)
+#define BTRFS_INODE_NOATIME (1U << 9)
+#define BTRFS_INODE_DIRSYNC (1U << 10)
+#define BTRFS_INODE_COMPRESS (1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
#define BTRFS_INODE_FLAG_MASK \
(BTRFS_INODE_NODATASUM | \
@@ -1336,20 +1718,25 @@ do { \
BTRFS_INODE_COMPRESS | \
BTRFS_INODE_ROOT_ITEM_INIT)
+#define BTRFS_INODE_RO_VERITY (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
+
struct btrfs_map_token {
- const struct extent_buffer *eb;
+ struct extent_buffer *eb;
char *kaddr;
unsigned long offset;
};
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
- ((bytes) >> (fs_info)->sb->s_blocksize_bits)
+ ((bytes) >> (fs_info)->sectorsize_bits)
static inline void btrfs_init_map_token(struct btrfs_map_token *token,
struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = NULL;
+ token->kaddr = page_address(eb->pages[0]);
+ token->offset = 0;
}
/* some macros to generate set/get functions for the struct fields. This
@@ -1360,6 +1747,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1373,15 +1770,14 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
sizeof(((type *)0)->member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
- const void *ptr, unsigned long off, \
- struct btrfs_map_token *token); \
-void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \
- unsigned long off, u##bits val, \
- struct btrfs_map_token *token); \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off); \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off, \
+ u##bits val); \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off); \
-void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val);
DECLARE_BTRFS_SETGET_BITS(8)
@@ -1393,69 +1789,66 @@ DECLARE_BTRFS_SETGET_BITS(64)
static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
-static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
-static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\
- const type *s, \
- struct btrfs_map_token *token) \
+static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
+ const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
- return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
-static inline void btrfs_set_token_##name(struct extent_buffer *eb, \
- type *s, u##bits val, \
- struct btrfs_map_token *token) \
+static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
+ type *s, u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
- btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
+ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ const type *p = page_address(eb->pages[0]) + \
+ offset_in_page(eb->start); \
+ return get_unaligned_le##bits(&p->member); \
} \
-static inline void btrfs_set_##name(struct extent_buffer *eb, \
+static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
-static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb,
+static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
total_bytes));
}
-static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb,
+static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
}
@@ -1554,13 +1947,13 @@ static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
}
-static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
struct btrfs_chunk *c, int nr)
{
return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
}
-static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
struct btrfs_chunk *c, int nr)
{
return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
@@ -1640,31 +2033,21 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
chunk_offset, 64);
BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
-
-static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
-{
- unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
- return (unsigned long)dev + ptr;
-}
-
BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
generation, 64);
BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
-BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
-
-
BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
-static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
struct btrfs_disk_key *key)
{
read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}
-static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
struct btrfs_disk_key *key)
{
@@ -1702,12 +2085,6 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
return 0;
}
-BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
-BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
- generation, 64);
-BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
-
/* struct btrfs_node */
BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
@@ -1716,7 +2093,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
generation, 64);
-static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
{
unsigned long ptr;
ptr = offsetof(struct btrfs_node, ptrs) +
@@ -1724,7 +2101,7 @@ static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
}
-static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
int nr, u64 val)
{
unsigned long ptr;
@@ -1733,7 +2110,7 @@ static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
}
-static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
{
unsigned long ptr;
ptr = offsetof(struct btrfs_node, ptrs) +
@@ -1741,7 +2118,7 @@ static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
}
-static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
int nr, u64 val)
{
unsigned long ptr;
@@ -1759,7 +2136,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr)
void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
-static inline void btrfs_set_node_key(struct extent_buffer *eb,
+static inline void btrfs_set_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr;
@@ -1769,8 +2146,8 @@ static inline void btrfs_set_node_key(struct extent_buffer *eb,
}
/* struct btrfs_item */
-BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
-BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
@@ -1785,25 +2162,36 @@ static inline struct btrfs_item *btrfs_item_nr(int nr)
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
-static inline u32 btrfs_item_end(const struct extent_buffer *eb,
- struct btrfs_item *item)
-{
- return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
+#define BTRFS_ITEM_SETGET_FUNCS(member) \
+static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \
+ int slot) \
+{ \
+ return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \
+} \
+static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
+ int slot, u32 val) \
+{ \
+ btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \
+} \
+static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
+ int slot) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ return btrfs_token_raw_item_##member(token, item); \
+} \
+static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
+ int slot, u32 val) \
+{ \
+ struct btrfs_item *item = btrfs_item_nr(slot); \
+ btrfs_set_token_raw_item_##member(token, item, val); \
}
-static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_end(eb, btrfs_item_nr(nr));
-}
-
-static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr)
-{
- return btrfs_item_offset(eb, btrfs_item_nr(nr));
-}
+BTRFS_ITEM_SETGET_FUNCS(offset)
+BTRFS_ITEM_SETGET_FUNCS(size);
-static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr)
+static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
{
- return btrfs_item_size(eb, btrfs_item_nr(nr));
+ return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr);
}
static inline void btrfs_item_key(const struct extent_buffer *eb,
@@ -1883,6 +2271,52 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Optimized helpers for little-endian architectures where CPU and on-disk
+ * structures have the same endianness and we can skip conversions.
+ */
+
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
+ const struct btrfs_disk_key *disk_key)
+{
+ memcpy(cpu_key, disk_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *cpu_key)
+{
+ memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
+}
+
+static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_node_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
+ struct btrfs_key *cpu_key, int nr)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_item_key(eb, disk_key, nr);
+}
+
+static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
+ const struct btrfs_dir_item *item,
+ struct btrfs_key *cpu_key)
+{
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key;
+
+ btrfs_dir_item_key(eb, item, disk_key);
+}
+
+#else
+
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
const struct btrfs_disk_key *disk)
{
@@ -1924,6 +2358,8 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
btrfs_disk_key_to_cpu(key, &disk_key);
}
+#endif
+
/* struct btrfs_header */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
@@ -1971,16 +2407,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
btrfs_set_header_flags(eb, flags);
}
-static inline unsigned long btrfs_header_fsid(void)
-{
- return offsetof(struct btrfs_header, fsid);
-}
-
-static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
-{
- return offsetof(struct btrfs_header, chunk_tree_uuid);
-}
-
static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
@@ -1996,6 +2422,7 @@ BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
generation, 64);
BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
@@ -2017,14 +2444,21 @@ BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
static inline bool btrfs_root_readonly(const struct btrfs_root *root)
{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
static inline bool btrfs_root_dead(const struct btrfs_root *root)
{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
+static inline u64 btrfs_root_id(const struct btrfs_root *root)
+{
+ return root->root_key.objectid;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2177,8 +2611,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
- log_root_transid, 64);
BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
log_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
@@ -2212,7 +2644,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __const btrfs_get_num_csums(void);
+size_t __attribute_const__ btrfs_get_num_csums(void);
/*
@@ -2226,11 +2658,12 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf)
if (nr == 0)
return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
- return btrfs_item_offset_nr(leaf, nr - 1);
+ return btrfs_item_offset(leaf, nr - 1);
}
/* struct btrfs_file_extent_item */
-BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
+ type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
struct btrfs_file_extent_item, disk_bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
@@ -2239,6 +2672,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
+ struct btrfs_file_extent_item, ram_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
struct btrfs_file_extent_item, disk_num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
@@ -2255,6 +2690,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -2281,9 +2717,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
*/
static inline u32 btrfs_file_extent_inline_item_len(
const struct extent_buffer *eb,
- struct btrfs_item *e)
+ int nr)
{
- return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
+ return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/* btrfs_qgroup_status_item */
@@ -2375,11 +2811,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
/* helper function to cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
#define btrfs_item_ptr_offset(leaf, slot) \
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
- btrfs_item_offset_nr(leaf, slot)))
+ btrfs_item_offset(leaf, slot)))
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
@@ -2424,27 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
@@ -2458,21 +2873,23 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
+ int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr);
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -2490,34 +2907,62 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
- int level, int is_data);
+ struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start,
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len);
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_get_block_group_trimming(struct btrfs_block_group *cache);
-void btrfs_put_block_group_trimming(struct btrfs_block_group *cache);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+/*
+ * Different levels for to flush space when doing space reservations.
+ *
+ * The higher the level, the more methods we try to reclaim space.
+ */
enum btrfs_reserve_flush_enum {
/* If we are in the transaction, we can't flush anything.*/
BTRFS_RESERVE_NO_FLUSH,
+
/*
- * Flushing delalloc may cause deadlock somewhere, in this
- * case, use FLUSH LIMIT
+ * Flush space by:
+ * - Running delayed inode items
+ * - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_LIMIT,
+
+ /*
+ * Flush space by:
+ * - Running delayed inode items
+ * - Running delayed refs
+ * - Running delalloc and waiting for ordered extents
+ * - Allocating a new chunk
+ */
BTRFS_RESERVE_FLUSH_EVICT,
+
+ /*
+ * Flush space by above mentioned methods and by:
+ * - Running delayed iputs
+ * - Committing transaction
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
+
+ /*
+ * Pretty much the same as FLUSH_ALL, but can also steal space from
+ * global rsv.
+ *
+ * Can be interrupted by a fatal signal.
+ */
+ BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
enum btrfs_flush_state {
@@ -2527,20 +2972,22 @@ enum btrfs_flush_state {
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
- ALLOC_CHUNK = 7,
- ALLOC_CHUNK_FORCE = 8,
- RUN_DELAYED_IPUTS = 9,
- COMMIT_TRANS = 10,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -2557,7 +3004,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int level, int *slot);
+ int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
@@ -2568,8 +3015,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2582,7 +3027,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2628,16 +3074,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, so it ends up being more efficient
+ * than having btrfs_insert_empty_items() or setup_item_for_insert()
+ * doing it, as it would avoid an extra loop over a potentially large
+ * array, and in the case of setup_item_for_insert(), we would be doing
+ * it while holding a write lock on a leaf and often on upper level nodes
+ * too, unnecessarily increasing the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+ const struct btrfs_item_batch *batch);
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2645,13 +3117,52 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
const struct btrfs_key *key,
u32 data_size)
{
- return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
}
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
+
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+/*
+ * Search in @root for a given @key, and store the slot found in @found_key.
+ *
+ * @root: The root node of the tree.
+ * @key: The key we are looking for.
+ * @found_key: Will hold the found item.
+ * @path: Holds the current slot/leaf.
+ * @iter_ret: Contains the value returned from btrfs_search_slot or
+ * btrfs_get_next_valid_item, whichever was executed last.
+ *
+ * The @iter_ret is an output variable that will contain the return value of
+ * btrfs_search_slot, if it encountered an error, or the value returned from
+ * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
+ * slot was found, 1 if there were no more leaves, and <0 if there was an error.
+ *
+ * It's recommended to use a separate variable for iter_ret and then use it to
+ * set the function return value so there's no confusion of the 0/1/errno
+ * values stemming from btrfs_search_slot.
+ */
+#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
+ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
+ (iter_ret) >= 0 && \
+ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
+ (path)->slots[0]++ \
+ )
+
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -2660,14 +3171,25 @@ static inline int btrfs_next_old_item(struct btrfs_root *root,
return btrfs_next_old_leaf(root, p, time_seq);
return 0;
}
+
+/*
+ * Search the tree again to find a leaf with greater keys.
+ *
+ * Returns 0 if it found something or 1 if there are no greater leaves.
+ * Returns < 0 on error.
+ */
+static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
}
int btrfs_leaf_free_space(struct extent_buffer *leaf);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- int update_ref, int for_reloc);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+ int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2689,35 +3211,27 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
* If we remount the fs to be R/O or umount the fs, the cleaner needn't do
* anything except sleeping. This function is used to check the status of
* the fs.
+ * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
+ * since setting and checking for SB_RDONLY in the superblock's flags is not
+ * atomic.
*/
static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
- return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
+ return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
+ btrfs_fs_closing(fs_info);
}
-static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+static inline void btrfs_set_sb_rdonly(struct super_block *sb)
{
- kfree(fs_info->balance_ctl);
- kfree(fs_info->delayed_root);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info->quota_root);
- kfree(fs_info->uuid_root);
- kfree(fs_info->free_space_root);
- kfree(fs_info->super_copy);
- kfree(fs_info->super_for_commit);
- kvfree(fs_info);
+ sb->s_flags |= SB_RDONLY;
+ set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
-/* tree mod log functions from ctree.c */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
+{
+ sb->s_flags &= ~SB_RDONLY;
+ clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
+}
/* root-item.c */
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
@@ -2750,9 +3264,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
- int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
- u64));
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -2769,7 +3281,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
@@ -2801,48 +3313,13 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-item.c */
-int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 index);
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, u64 *index);
-int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, u64 objectid);
-int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path,
- struct btrfs_key *location, int mod);
-
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
-
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
- int slot, const char *name,
- int name_len);
-struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
- const char *name, int name_len);
/* file-item.c */
-struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 offset, u8 *dst);
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -2850,88 +3327,113 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- u64 file_start, int contig);
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit);
+ struct list_head *list, int search_commit,
+ bool nowait);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
const bool new_inline,
struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
+u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes);
+ u64 *ram_bytes, bool nowait, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index);
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
- int front);
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode, u64 new_size,
- u32 min_type);
-
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front);
+
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context);
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid);
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /*
+ * Output from btrfs_new_inode_prepare(), input to
+ * btrfs_create_new_inode().
+ */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+};
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits);
+ u32 bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags);
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root);
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
+ struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+ struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
@@ -2942,23 +3444,55 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
- u64 end, int uptodate);
+int btrfs_writepage_cow_fixup(struct page *page);
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before);
+
extern const struct dentry_operations btrfs_dentry_operations;
+/* Inode locking type flags, by default the exclusive lock is taken */
+#define BTRFS_ILOCK_SHARED (1U << 0)
+#define BTRFS_ILOCK_TRY (1U << 1)
+#define BTRFS_ILOCK_MMAP (1U << 2)
+
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
+
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_pages);
+ u64 newer_than, unsigned long max_to_defrag);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -2968,37 +3502,33 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
+ struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned);
extern const struct file_operations btrfs_file_operations;
-int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache,
- int replace_extent,
- u32 extent_item_size,
- int *key_inserted);
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode, u64 start,
- u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
- const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args);
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached);
+ struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags);
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait);
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3008,17 +3538,37 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
-#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_printk(fs_info, fmt, args...) \
+do { \
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
+ _btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#elif defined(CONFIG_PRINTK)
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
__printf(2, 3)
__cold
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
#else
+
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
@@ -3171,6 +3721,52 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
#define ASSERT(expr) (void)(expr)
#endif
+#if BITS_PER_LONG == 32
+#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
+/*
+ * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
+ * addresses of extents.
+ *
+ * For 4K page size it's about 10T, for 64K it's 160T.
+ */
+#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
+void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
+void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
+#endif
+
+/*
+ * Get the correct offset inside the page of extent buffer.
+ *
+ * @eb: target extent buffer
+ * @start: offset inside the extent buffer
+ *
+ * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
+ */
+static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
+ unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
+ * to PAGE_SIZE, thus adding it won't cause any difference.
+ *
+ * For sectorsize < PAGE_SIZE, we must only read the data that belongs
+ * to the eb, thus we have to take the eb->start into consideration.
+ */
+ return offset_in_page(offset + eb->start);
+}
+
+static inline unsigned long get_eb_page_index(unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ *
+ * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
+ * and have ensured that all tree blocks are contained in one page,
+ * thus we always get index == 0.
+ */
+ return offset >> PAGE_SHIFT;
+}
+
/*
* Use that for functions that are conditionally exported for sanity tests but
* otherwise static
@@ -3198,21 +3794,26 @@ const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, bool first_hit);
+
+bool __cold abort_should_print_stack(int errno);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
- * detected, that way the exact line number is reported.
+ * detected, that way the exact stack trace is reported for some errors.
*/
#define btrfs_abort_transaction(trans, errno) \
do { \
+ bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- if ((errno) != -EIO) { \
- WARN(1, KERN_DEBUG \
+ first = true; \
+ if (WARN(abort_should_print_stack(errno), \
+ KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno)); \
+ (errno))) { \
+ /* Stack trace printed. */ \
} else { \
btrfs_debug((trans)->fs_info, \
"Transaction aborted (error %d)", \
@@ -3220,15 +3821,34 @@ do { \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno)); \
+ __LINE__, (errno), first); \
} while (0)
+#ifdef CONFIG_PRINTK_INDEX
+
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
+do { \
+ printk_index_subsys_emit( \
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
+ KERN_CRIT, fmt); \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
} while (0)
+#else
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+#endif
+
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
+
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3372,17 +3992,19 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
-int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir);
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
-static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return 0;
+ return -EOPNOTSUPP;
}
#endif
@@ -3392,8 +4014,8 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
@@ -3401,6 +4023,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3412,16 +4038,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
-static inline void btrfs_init_full_stripe_locks_tree(
- struct btrfs_full_stripe_locks_tree *locks_root)
-{
- locks_root->root = RB_ROOT;
- mutex_init(&locks_root->lock);
-}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
@@ -3429,21 +4048,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
btrfs_bio_counter_sub(fs_info, 1);
}
-/* reada.c */
-struct reada_control {
- struct btrfs_fs_info *fs_info; /* tree to prefetch */
- struct btrfs_key key_start;
- struct btrfs_key key_end; /* exclusive */
- atomic_t elems;
- struct kref refcnt;
- wait_queue_head_t wait;
-};
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *start, struct btrfs_key *end);
-int btrfs_reada_wait(void *handle);
-void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct extent_buffer *eb, int err);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -3458,13 +4062,40 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+ size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+ struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+ struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ return 0;
+}
+
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
+#endif
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
@@ -3476,4 +4107,22 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
+/*
+ * We use page status Private2 to indicate there is an ordered extent with
+ * unfinished IO.
+ *
+ * Rename the Private2 accessors to Ordered, to improve readability.
+ */
+#define PageOrdered(page) PagePrivate2(page)
+#define SetPageOrdered(page) SetPagePrivate2(page)
+#define ClearPageOrdered(page) ClearPagePrivate2(page)
+#define folio_test_ordered(folio) folio_test_private_2(folio)
+#define folio_set_ordered(folio) folio_set_private_2(folio)
+#define folio_clear_ordered(folio) folio_clear_private_2(folio)
+
#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 4cdac4d834f5..118b2e20b2e1 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -9,136 +9,129 @@
#include "qgroup.h"
#include "block-group.h"
+/*
+ * HOW DOES THIS WORK
+ *
+ * There are two stages to data reservations, one for data and one for metadata
+ * to handle the new extents and checksums generated by writing data.
+ *
+ *
+ * DATA RESERVATION
+ * The general flow of the data reservation is as follows
+ *
+ * -> Reserve
+ * We call into btrfs_reserve_data_bytes() for the user request bytes that
+ * they wish to write. We make this reservation and add it to
+ * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
+ * for the range and carry on if this is buffered, or follow up trying to
+ * make a real allocation if we are pre-allocating or doing O_DIRECT.
+ *
+ * -> Use
+ * At writepages()/prealloc/O_DIRECT time we will call into
+ * btrfs_reserve_extent() for some part or all of this range of bytes. We
+ * will make the allocation and subtract space_info->bytes_may_use by the
+ * original requested length and increase the space_info->bytes_reserved by
+ * the allocated length. This distinction is important because compression
+ * may allocate a smaller on disk extent than we previously reserved.
+ *
+ * -> Allocation
+ * finish_ordered_io() will insert the new file extent item for this range,
+ * and then add a delayed ref update for the extent tree. Once that delayed
+ * ref is written the extent size is subtracted from
+ * space_info->bytes_reserved and added to space_info->bytes_used.
+ *
+ * Error handling
+ *
+ * -> By the reservation maker
+ * This is the simplest case, we haven't completed our operation and we know
+ * how much we reserved, we can simply call
+ * btrfs_free_reserved_data_space*() and it will be removed from
+ * space_info->bytes_may_use.
+ *
+ * -> After the reservation has been made, but before cow_file_range()
+ * This is specifically for the delalloc case. You must clear
+ * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
+ * be subtracted from space_info->bytes_may_use.
+ *
+ * METADATA RESERVATION
+ * The general metadata reservation lifetimes are discussed elsewhere, this
+ * will just focus on how it is used for delalloc space.
+ *
+ * We keep track of two things on a per inode bases
+ *
+ * ->outstanding_extents
+ * This is the number of file extent items we'll need to handle all of the
+ * outstanding DELALLOC space we have in this inode. We limit the maximum
+ * size of an extent, so a large contiguous dirty area may require more than
+ * one outstanding_extent, which is why count_max_extents() is used to
+ * determine how many outstanding_extents get added.
+ *
+ * ->csum_bytes
+ * This is essentially how many dirty bytes we have for this inode, so we
+ * can calculate the number of checksum items we would have to add in order
+ * to checksum our outstanding data.
+ *
+ * We keep a per-inode block_rsv in order to make it easier to keep track of
+ * our reservation. We use btrfs_calculate_inode_block_rsv_size() to
+ * calculate the current theoretical maximum reservation we would need for the
+ * metadata for this inode. We call this and then adjust our reservation as
+ * necessary, either by attempting to reserve more space, or freeing up excess
+ * space.
+ *
+ * OUTSTANDING_EXTENTS HANDLING
+ *
+ * ->outstanding_extents is used for keeping track of how many extents we will
+ * need to use for this inode, and it will fluctuate depending on where you are
+ * in the life cycle of the dirty data. Consider the following normal case for
+ * a completely clean inode, with a num_bytes < our maximum allowed extent size
+ *
+ * -> reserve
+ * ->outstanding_extents += 1 (current value is 1)
+ *
+ * -> set_delalloc
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_delalloc_release_extents()
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * We must call this once we are done, as we hold our reservation for the
+ * duration of our operation, and then assume set_delalloc will update the
+ * counter appropriately.
+ *
+ * -> add ordered extent
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_clear_delalloc_extent
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * -> finish_ordered_io/btrfs_remove_ordered_extent
+ * ->outstanding_extents -= 1 (current value is 0)
+ *
+ * Each stage is responsible for their own accounting of the extent, thus
+ * making error handling and cleanup easier.
+ */
+
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
-int btrfs_check_data_free_space(struct inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start,
+ u64 len, bool noflush)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
int ret;
/* align the range */
@@ -146,16 +139,24 @@ int btrfs_check_data_free_space(struct inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+ if (noflush)
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ else if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ ret = btrfs_reserve_data_bytes(fs_info, len, flush);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
- if (ret < 0)
- btrfs_free_reserved_data_space_noquota(inode, start, len);
- else
+ if (ret < 0) {
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ } else {
ret = 0;
+ }
return ret;
}
@@ -167,21 +168,15 @@ int btrfs_check_data_free_space(struct inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_space_info *data_sinfo;
- /* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, fs_info->sectorsize) -
- round_down(start, fs_info->sectorsize);
- start = round_down(start, fs_info->sectorsize);
+ ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
@@ -191,27 +186,29 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
/* Make sure the range is aligned to sectorsize */
- len = round_up(start + len, root->fs_info->sectorsize) -
- round_down(start, root->fs_info->sectorsize);
- start = round_down(start, root->fs_info->sectorsize);
+ len = round_up(start + len, fs_info->sectorsize) -
+ round_down(start, fs_info->sectorsize);
+ start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
btrfs_qgroup_free_data(inode, reserved, start, len);
}
/**
- * btrfs_inode_rsv_release - release any excessive reservation.
- * @inode - the inode we need to release from.
- * @qgroup_free - free or convert qgroup meta.
- * Unlike normal operation, qgroup meta reservation needs to know if we are
- * freeing qgroup reservation or just converting it into per-trans. Normally
- * @qgroup_free is true for error handling, and false for normal release.
+ * Release any excessive reservation
+ *
+ * @inode: the inode we need to release from
+ * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
+ * meta reservation needs to know if we are freeing qgroup
+ * reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal
+ * release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
@@ -228,8 +225,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
- released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
- &qgroup_to_release);
+ released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
@@ -280,11 +277,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
}
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
- u64 num_bytes, u64 *meta_reserve,
- u64 *qgroup_reserve)
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
{
- u64 nr_extents = count_max_extents(num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+ u64 nr_extents = count_max_extents(fs_info, num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
@@ -298,7 +295,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
*qgroup_reserve = nr_extents * fs_info->nodesize;
}
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes, bool noflush)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -317,7 +315,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
- if (btrfs_is_free_space_inode(inode)) {
+ if (noflush || btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
@@ -328,6 +326,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
/*
* We always want to do it this way, every other way is wrong and ends
@@ -339,12 +338,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
- &qgroup_reserve);
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
@@ -357,9 +357,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* needs to free the reservation we just made.
*/
spin_lock(&inode->lock);
- nr_extents = count_max_extents(num_bytes);
+ nr_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += num_bytes;
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -376,7 +376,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * Release a metadata reservation for an inode
+ *
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
* @qgroup_free: free qgroup reservation or convert it to per-trans reservation
@@ -419,7 +420,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
unsigned num_extents;
spin_lock(&inode->lock);
- num_extents = count_max_extents(num_bytes);
+ num_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, -num_extents);
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -455,36 +456,41 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
- if (ret < 0)
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
+ if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
+ extent_changeset_free(*reserved);
+ *reserved = NULL;
+ }
return ret;
}
/**
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
- * @inode: inode we're releasing space for
- * @start: start position of the space already reserved
- * @len: the len of the space already reserved
- * @release_bytes: the len of the space we consumed or didn't use
+ * Release data and metadata space for delalloc
+ *
+ * @inode: inode we're releasing space for
+ * @reserved: list of changed/reserved ranges
+ * @start: start position of the space already reserved
+ * @len: length of the space already reserved
+ * @qgroup_free: should qgroup reserved-space also be freed
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+ btrfs_delalloc_release_metadata(inode, len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 54466fbd7075..e07d46043455 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -6,18 +6,19 @@
struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-int btrfs_check_data_free_space(struct inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len,
+ bool noflush);
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3e15e1d4a91..cac5169eaf8d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -13,6 +13,7 @@
#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
+#include "inode-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -51,18 +52,6 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->p_list);
}
-static inline int btrfs_is_continuous_delayed_item(
- struct btrfs_delayed_item *item1,
- struct btrfs_delayed_item *item2)
-{
- if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
- item1->key.objectid == item2->key.objectid &&
- item1->key.type == item2->key.type &&
- item1->key.offset + 1 == item2->key.offset)
- return 1;
- return 0;
-}
-
static struct btrfs_delayed_node *btrfs_get_delayed_node(
struct btrfs_inode *btrfs_inode)
{
@@ -313,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
}
-static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+ struct btrfs_delayed_node *node,
+ enum btrfs_delayed_item_type type)
{
struct btrfs_delayed_item *item;
+
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
if (item) {
item->data_len = data_len;
- item->ins_or_del = 0;
+ item->type = type;
item->bytes_reserved = 0;
- item->delayed_node = NULL;
+ item->delayed_node = node;
+ RB_CLEAR_NODE(&item->rb_node);
+ INIT_LIST_HEAD(&item->log_list);
+ item->logged = false;
refcount_set(&item->refs, 1);
}
return item;
@@ -330,89 +325,46 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
/*
* __btrfs_lookup_delayed_item - look up the delayed item by key
* @delayed_node: pointer to the delayed node
- * @key: the key to look up
- * @prev: used to store the prev item if the right item isn't found
- * @next: used to store the next item if the right item isn't found
+ * @index: the dir index value to lookup (offset of a dir index key)
*
* Note: if we don't find the right item, we will return the prev item and
* the next item.
*/
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
- struct btrfs_key *key,
- struct btrfs_delayed_item **prev,
- struct btrfs_delayed_item **next)
+ u64 index)
{
- struct rb_node *node, *prev_node = NULL;
+ struct rb_node *node = root->rb_node;
struct btrfs_delayed_item *delayed_item = NULL;
- int ret = 0;
-
- node = root->rb_node;
while (node) {
delayed_item = rb_entry(node, struct btrfs_delayed_item,
rb_node);
- prev_node = node;
- ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
- if (ret < 0)
+ if (delayed_item->index < index)
node = node->rb_right;
- else if (ret > 0)
+ else if (delayed_item->index > index)
node = node->rb_left;
else
return delayed_item;
}
- if (prev) {
- if (!prev_node)
- *prev = NULL;
- else if (ret < 0)
- *prev = delayed_item;
- else if ((node = rb_prev(prev_node)) != NULL) {
- *prev = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *prev = NULL;
- }
-
- if (next) {
- if (!prev_node)
- *next = NULL;
- else if (ret > 0)
- *next = delayed_item;
- else if ((node = rb_next(prev_node)) != NULL) {
- *next = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *next = NULL;
- }
return NULL;
}
-static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
- NULL, NULL);
-}
-
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
- struct btrfs_delayed_item *ins,
- int action)
+ struct btrfs_delayed_item *ins)
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
struct btrfs_delayed_item *item;
- int cmp;
bool leftmost = true;
- if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (action == BTRFS_DELAYED_DELETION_ITEM)
- root = &delayed_node->del_root;
else
- BUG();
+ root = &delayed_node->del_root;
+
p = &root->rb_root.rb_node;
node = &ins->rb_node;
@@ -421,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
item = rb_entry(parent_node, struct btrfs_delayed_item,
rb_node);
- cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0) {
+ if (item->index < ins->index) {
p = &(*p)->rb_right;
leftmost = false;
- } else if (cmp > 0) {
+ } else if (item->index > ins->index) {
p = &(*p)->rb_left;
} else {
return -EEXIST;
@@ -434,33 +385,16 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
- ins->delayed_node = delayed_node;
- ins->ins_or_del = action;
- if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
- action == BTRFS_DELAYED_INSERTION_ITEM &&
- ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->index >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->index + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
return 0;
}
-static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_INSERTION_ITEM);
-}
-
-static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_DELETION_ITEM);
-}
-
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
@@ -476,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
- /* Not associated with any delayed_node */
- if (!delayed_item->delayed_node)
+ /* Not inserted, ignore it. */
+ if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
+
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
- BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
- delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
- if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_item->delayed_node->ins_root;
else
root = &delayed_item->delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
+ RB_CLEAR_NODE(&delayed_item->rb_node);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -545,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 num_bytes;
int ret;
@@ -570,9 +503,15 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid,
+ item->delayed_node->inode_id,
num_bytes, 1);
- item->bytes_reserved = num_bytes;
+ /*
+ * For insertions we track reserved metadata space by accounting
+ * for the number of leaves that will be used, based on the delayed
+ * node's index_items_size field.
+ */
+ if (item->type == BTRFS_DELAYED_DELETION_ITEM)
+ item->bytes_reserved = num_bytes;
}
return ret;
@@ -593,16 +532,29 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
* to release/reserve qgroup space.
*/
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, item->bytes_reserved,
- 0);
- btrfs_block_rsv_release(fs_info, rsv,
- item->bytes_reserved);
+ item->delayed_node->inode_id,
+ item->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
+}
+
+static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
+ unsigned int num_leaves)
+{
+ struct btrfs_fs_info *fs_info = node->root->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
+
+ /* There are no space reservations during log replay, bail out. */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
+ bytes, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
}
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -627,38 +579,23 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- fs_info->nodesize, true);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
- /*
- * Since we're under a transaction reserve_metadata_bytes could
- * try to commit the transaction which will make it return
- * EAGAIN to make us stop the transaction we have, so return
- * ENOSPC instead so that btrfs_dirty_inode knows what to do.
- */
- if (ret == -EAGAIN) {
- ret = -ENOSPC;
+ /* NO_FLUSH could only fail with -ENOSPC */
+ ASSERT(ret == 0 || ret == -ENOSPC);
+ if (ret)
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
- }
- if (!ret) {
- node->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(fs_info,
- "delayed_inode",
- btrfs_ino(inode),
- num_bytes, 1);
- } else {
- btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize);
- }
- return ret;
+ } else {
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
- btrfs_ino(inode), num_bytes, 1);
+ node->inode_id, num_bytes, 1);
node->bytes_reserved = num_bytes;
}
@@ -677,8 +614,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
rsv = &fs_info->delayed_block_rsv;
trace_btrfs_space_reservation(fs_info, "delayed_inode",
node->inode_id, node->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, rsv,
- node->bytes_reserved);
+ btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
if (qgroup_free)
btrfs_qgroup_free_meta_prealloc(node->root,
node->bytes_reserved);
@@ -689,182 +625,201 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * This helper will insert some continuous items into the same leaf according
- * to the free space of the leaf.
+ * Insert a single delayed item or a batch of delayed items, as many as possible
+ * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
+ * in the rbtree, and if there's a gap between two consecutive dir index items,
+ * then it means at some point we had delayed dir indexes to add but they got
+ * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
+ * into the subvolume tree. Dir index keys also have their offsets coming from a
+ * monotonically increasing counter, so we can't get new keys with an offset that
+ * fits within a gap between delayed dir index items.
*/
-static int btrfs_batch_insert_items(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *item)
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *first_item)
{
- struct btrfs_delayed_item *curr, *next;
- int free_space;
- int total_data_size = 0, total_size = 0;
- struct extent_buffer *leaf;
- char *data_ptr;
- struct btrfs_key *keys;
- u32 *data_size;
- struct list_head head;
- int slot;
- int nitems;
- int i;
- int ret = 0;
-
- BUG_ON(!path->nodes[0]);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_node *node = first_item->delayed_node;
+ LIST_HEAD(item_list);
+ struct btrfs_delayed_item *curr;
+ struct btrfs_delayed_item *next;
+ const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
+ struct btrfs_item_batch batch;
+ struct btrfs_key first_key;
+ const u32 first_data_size = first_item->data_len;
+ int total_size;
+ char *ins_data = NULL;
+ int ret;
+ bool continuous_keys_only = false;
- leaf = path->nodes[0];
- free_space = btrfs_leaf_free_space(leaf);
- INIT_LIST_HEAD(&head);
+ lockdep_assert_held(&node->mutex);
- next = item;
- nitems = 0;
+ /*
+ * During normal operation the delayed index offset is continuously
+ * increasing, so we can batch insert all items as there will not be any
+ * overlapping keys in the tree.
+ *
+ * The exception to this is log replay, where we may have interleaved
+ * offsets in the tree, so our batch needs to be continuous keys only in
+ * order to ensure we do not end up with out of order items in our leaf.
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ continuous_keys_only = true;
/*
- * count the number of the continuous items that we can insert in batch
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
*/
- while (total_size + next->data_len + sizeof(struct btrfs_item) <=
- free_space) {
- total_data_size += next->data_len;
- total_size += next->data_len + sizeof(struct btrfs_item);
- list_add_tail(&next->tree_list, &head);
- nitems++;
+ ASSERT(first_item->bytes_reserved == 0);
+
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_data_size;
+ batch.nr = 1;
+ total_size = first_data_size + sizeof(struct btrfs_item);
+ curr = first_item;
+
+ while (true) {
+ int next_size;
- curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
+ /*
+ * We cannot allow gaps in the key space if we're doing log
+ * replay.
+ */
+ if (continuous_keys_only && (next->index != curr->index + 1))
break;
- }
- if (!nitems) {
- ret = 0;
- goto out;
- }
+ ASSERT(next->bytes_reserved == 0);
- /*
- * we need allocate some memory space, but it might cause the task
- * to sleep, so we set all locked nodes in the path to blocking locks
- * first.
- */
- btrfs_set_path_blocking(path);
+ next_size = next->data_len + sizeof(struct btrfs_item);
+ if (total_size + next_size > max_size)
+ break;
- keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
- if (!keys) {
- ret = -ENOMEM;
- goto out;
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
+ total_size += next_size;
+ batch.total_data_size += next->data_len;
+ curr = next;
}
- data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
- if (!data_size) {
- ret = -ENOMEM;
- goto error;
+ if (batch.nr == 1) {
+ first_key.objectid = node->inode_id;
+ first_key.type = BTRFS_DIR_INDEX_KEY;
+ first_key.offset = first_item->index;
+ batch.keys = &first_key;
+ batch.data_sizes = &first_data_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ int i = 0;
+
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
+ ins_keys[i].objectid = node->inode_id;
+ ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[i].offset = curr->index;
+ ins_sizes[i] = curr->data_len;
+ i++;
+ }
}
- /* get keys of all the delayed items */
- i = 0;
- list_for_each_entry(next, &head, tree_list) {
- keys[i] = next->key;
- data_size[i] = next->data_len;
- i++;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
+ if (ret)
+ goto out;
+
+ list_for_each_entry(curr, &item_list, tree_list) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ path->slots[0]++;
}
- /* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ /*
+ * Now release our path before releasing the delayed items and their
+ * metadata reservations, so that we don't block other tasks for more
+ * time than needed.
+ */
+ btrfs_release_path(path);
- /* insert the dir index items */
- slot = path->slots[0];
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- data_ptr = btrfs_item_ptr(leaf, slot, char);
- write_extent_buffer(leaf, &curr->data,
- (unsigned long)data_ptr,
- curr->data_len);
- slot++;
+ ASSERT(node->index_item_leaves > 0);
- btrfs_delayed_item_release_metadata(root, curr);
+ /*
+ * For normal operations we will batch an entire leaf's worth of delayed
+ * items, so if there are more items to process we can decrement
+ * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
+ *
+ * However for log replay we may not have inserted an entire leaf's
+ * worth of items, we may have not had continuous items, so decrementing
+ * here would mess up the index_item_leaves accounting. For this case
+ * only clean up the accounting when there are no items left.
+ */
+ if (next && !continuous_keys_only) {
+ /*
+ * We inserted one batch of items into a leaf a there are more
+ * items to flush in a future batch, now release one unit of
+ * metadata space from the delayed block reserve, corresponding
+ * the leaf we just flushed to.
+ */
+ btrfs_delayed_item_release_leaves(node, 1);
+ node->index_item_leaves--;
+ } else if (!next) {
+ /*
+ * There are no more items to insert. We can have a number of
+ * reserved leaves > 1 here - this happens when many dir index
+ * items are added and then removed before they are flushed (file
+ * names with a very short life, never span a transaction). So
+ * release all remaining leaves.
+ */
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-
-error:
- kfree(data_size);
- kfree(keys);
out:
+ kfree(ins_data);
return ret;
}
-/*
- * This helper can just do simple insertion that needn't extend item for new
- * data, such as directory name index insertion, inode insertion.
- */
-static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *delayed_item)
-{
- struct extent_buffer *leaf;
- char *ptr;
- int ret;
-
- ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
- delayed_item->data_len);
- if (ret < 0 && ret != -EEXIST)
- return ret;
-
- leaf = path->nodes[0];
-
- ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-
- write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
- delayed_item->data_len);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_delayed_item_release_metadata(root, delayed_item);
- return 0;
-}
-
-/*
- * we insert an item first, then if there are some continuous items, we try
- * to insert those items into the same leaf.
- */
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_insertion_item(node);
- if (!curr)
- goto insert_end;
-
- ret = btrfs_insert_delayed_item(trans, root, path, curr);
- if (ret < 0) {
- btrfs_release_path(path);
- goto insert_end;
- }
+ while (ret == 0) {
+ struct btrfs_delayed_item *curr;
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
- /* insert the continuous items into the same leaf */
- path->slots[0]++;
- btrfs_batch_insert_items(root, path, curr);
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ mutex_unlock(&node->mutex);
}
- btrfs_release_delayed_item(prev);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-insert_end:
- mutex_unlock(&node->mutex);
return ret;
}
@@ -873,62 +828,77 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ const u64 ino = item->delayed_node->inode_id;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- struct list_head head;
- int nitems, i, last_item;
- int ret = 0;
+ struct extent_buffer *leaf = path->nodes[0];
+ LIST_HEAD(batch_list);
+ int nitems, slot, last_slot;
+ int ret;
+ u64 total_reserved_size = item->bytes_reserved;
- BUG_ON(!path->nodes[0]);
+ ASSERT(leaf != NULL);
- leaf = path->nodes[0];
+ slot = path->slots[0];
+ last_slot = btrfs_header_nritems(leaf) - 1;
+ /*
+ * Our caller always gives us a path pointing to an existing item, so
+ * this can not happen.
+ */
+ ASSERT(slot <= last_slot);
+ if (WARN_ON(slot > last_slot))
+ return -ENOENT;
- i = path->slots[0];
- last_item = btrfs_header_nritems(leaf) - 1;
- if (i > last_item)
- return -ENOENT; /* FIXME: Is errno suitable? */
+ nitems = 1;
+ curr = item;
+ list_add_tail(&curr->tree_list, &batch_list);
- next = item;
- INIT_LIST_HEAD(&head);
- btrfs_item_key_to_cpu(leaf, &key, i);
- nitems = 0;
/*
- * count the number of the dir index items that we can delete in batch
+ * Keep checking if the next delayed item matches the next item in the
+ * leaf - if so, we can add it to the batch of items to delete from the
+ * leaf.
*/
- while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
- list_add_tail(&next->tree_list, &head);
- nitems++;
+ while (slot < last_slot) {
+ struct btrfs_key key;
- curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
- break;
-
- i++;
- if (i > last_item)
+ slot++;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
break;
- btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems++;
+ curr = next;
+ list_add_tail(&curr->tree_list, &batch_list);
+ total_reserved_size += curr->bytes_reserved;
}
- if (!nitems)
- return 0;
-
ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
if (ret)
- goto out;
+ return ret;
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(root, curr);
+ /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
+ if (total_reserved_size > 0) {
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we
+ * don't need to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+ total_reserved_size, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
+ total_reserved_size, NULL);
+ }
+
+ list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- return ret;
+ return 0;
}
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
@@ -936,43 +906,57 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
+ struct btrfs_key key;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_deletion_item(node);
- if (!curr)
- goto delete_fail;
+ key.objectid = node->inode_id;
+ key.type = BTRFS_DIR_INDEX_KEY;
+
+ while (ret == 0) {
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_deletion_item(node);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+
+ key.offset = item->index;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ /*
+ * There's no matching item in the leaf. This means we
+ * have already deleted this item in a past run of the
+ * delayed items. We ignore errors when running delayed
+ * items from an async context, through a work queue job
+ * running btrfs_async_run_delayed_root(), and don't
+ * release delayed items that failed to complete. This
+ * is because we will retry later, and at transaction
+ * commit time we always run delayed items and will
+ * then deal with errors if they fail to run again.
+ *
+ * So just release delayed items for which we can't find
+ * an item in the tree, and move to the next item.
+ */
+ btrfs_release_path(path);
+ btrfs_release_delayed_item(item);
+ ret = 0;
+ } else if (ret == 0) {
+ ret = btrfs_batch_delete_items(trans, root, path, item);
+ btrfs_release_path(path);
+ }
- ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- if (ret < 0)
- goto delete_fail;
- else if (ret > 0) {
/*
- * can't find the item which the node points to, so this node
- * is invalid, just drop it.
+ * We unlock and relock on each iteration, this is to prevent
+ * blocking other tasks for too long while we are being run from
+ * the async context (work queue job). Those tasks are typically
+ * running system calls like creat/mkdir/rename/unlink/etc which
+ * need to add delayed items to this delayed node.
*/
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- btrfs_release_delayed_item(prev);
- ret = 0;
- btrfs_release_path(path);
- if (curr) {
- mutex_unlock(&node->mutex);
- goto do_again;
- } else
- goto delete_fail;
+ mutex_unlock(&node->mutex);
}
- btrfs_batch_delete_items(trans, root, path, curr);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-delete_fail:
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return ret;
}
@@ -993,14 +977,16 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
- struct btrfs_delayed_root *delayed_root;
- ASSERT(delayed_node->root);
- clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
- delayed_node->count--;
+ if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+ struct btrfs_delayed_root *delayed_root;
- delayed_root = delayed_node->root->fs_info->delayed_root;
- finish_one_item(delayed_root);
+ ASSERT(delayed_node->root);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
}
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1025,12 +1011,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
mod = 1;
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
- if (ret > 0) {
- btrfs_release_path(path);
- return -ENOENT;
- } else if (ret < 0) {
- return ret;
- }
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1040,7 +1024,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
- goto no_iref;
+ goto out;
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf))
@@ -1062,12 +1046,19 @@ again:
btrfs_del_item(trans, root, path);
out:
btrfs_release_delayed_iref(node);
-no_iref:
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the improper
+ * counts behind.
+ */
+ if (ret && ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
+
return ret;
search:
@@ -1075,6 +1066,7 @@ search:
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto err_out;
@@ -1139,13 +1131,12 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
int ret = 0;
bool count = (nr > 0);
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return -EIO;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
@@ -1153,7 +1144,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
curr_node = btrfs_first_delayed_node(delayed_root);
- while (curr_node && (!count || (count && nr--))) {
+ while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
@@ -1210,7 +1201,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
btrfs_release_delayed_node(delayed_node);
return -ENOMEM;
}
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
@@ -1255,7 +1245,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
ret = -ENOMEM;
goto trans_out;
}
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
@@ -1324,7 +1313,6 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
if (!delayed_node)
break;
- path->leave_spinning = 1;
root = delayed_node->root;
trans = btrfs_join_transaction(root);
@@ -1427,24 +1415,28 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_disk_key *disk_key, u8 type,
u64 index)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
+ bool reserve_leaf_space;
+ u32 data_len;
int ret;
delayed_node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
- delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+ delayed_node,
+ BTRFS_DELAYED_INSERTION_ITEM);
if (!delayed_item) {
ret = -ENOMEM;
goto release_node;
}
- delayed_item->key.objectid = btrfs_ino(dir);
- delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
- delayed_item->key.offset = index;
+ delayed_item->index = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
@@ -1454,15 +1446,51 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ data_len = delayed_item->data_len + sizeof(struct btrfs_item);
mutex_lock(&delayed_node->mutex);
- ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+
+ if (delayed_node->index_item_leaves == 0 ||
+ delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
+ delayed_node->curr_index_batch_size = data_len;
+ reserve_leaf_space = true;
+ } else {
+ delayed_node->curr_index_batch_size += data_len;
+ reserve_leaf_space = false;
+ }
+
+ if (reserve_leaf_space) {
+ ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
+ /*
+ * Space was reserved for a dir index item insertion when we
+ * started the transaction, so getting a failure here should be
+ * impossible.
+ */
+ if (WARN_ON(ret)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_item(delayed_item);
+ goto release_node;
+ }
+
+ delayed_node->index_item_leaves++;
+ } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+ }
+
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1479,19 +1507,48 @@ release_node:
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node,
- struct btrfs_key *key)
+ u64 index)
{
struct btrfs_delayed_item *item;
mutex_lock(&node->mutex);
- item = __btrfs_lookup_delayed_insertion_item(node, key);
+ item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
return 1;
}
- btrfs_delayed_item_release_metadata(node->root, item);
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata()).
+ */
+ ASSERT(item->bytes_reserved == 0);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * If there's only one leaf reserved, we can decrement this item from the
+ * current batch, otherwise we can not because we don't know which leaf
+ * it belongs to. With the current limit on delayed items, we rarely
+ * accumulate enough dir index items to fill more than one leaf (even
+ * when using a leaf size of 4K).
+ */
+ if (node->index_item_leaves == 1) {
+ const u32 data_len = item->data_len + sizeof(struct btrfs_item);
+
+ ASSERT(node->curr_index_batch_size >= data_len);
+ node->curr_index_batch_size -= data_len;
+ }
+
btrfs_release_delayed_item(item);
+
+ /* If we now have no more dir index items, we can release all leaves. */
+ if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
mutex_unlock(&node->mutex);
return 0;
}
@@ -1501,31 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
- struct btrfs_key item_key;
int ret;
node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(node))
return PTR_ERR(node);
- item_key.objectid = btrfs_ino(dir);
- item_key.type = BTRFS_DIR_INDEX_KEY;
- item_key.offset = index;
-
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
- &item_key);
+ ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
if (!ret)
goto end;
- item = btrfs_alloc_delayed_item(0);
+ item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
ret = -ENOMEM;
goto end;
}
- item->key = item_key;
+ item->index = index;
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1538,7 +1589,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_lock(&node->mutex);
- ret = __btrfs_add_delayed_deletion_item(node, item);
+ ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1590,8 +1641,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- inode_unlock_shared(inode);
- inode_lock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1654,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int ret = 0;
list_for_each_entry(curr, del_list, readdir_list) {
- if (curr->key.offset > index)
+ if (curr->index > index)
break;
- if (curr->key.offset == index) {
+ if (curr->index == index) {
ret = 1;
break;
}
@@ -1690,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < ctx->pos) {
+ if (curr->index < ctx->pos) {
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- ctx->pos = curr->key.offset;
+ ctx->pos = curr->index;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
@@ -1722,6 +1773,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
struct inode *inode)
{
+ u64 flags;
+
btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
@@ -1734,7 +1787,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
inode_peek_iversion(inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1760,6 +1815,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
@@ -1779,6 +1835,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
@@ -1789,7 +1847,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_stack_inode_sequence(inode_item));
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
- BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+ btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
@@ -1814,27 +1873,28 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item,
+ &inode->vfs_inode);
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode),
- delayed_node);
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
@@ -1897,12 +1957,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
}
+ if (delayed_node->index_item_leaves > 0) {
+ btrfs_delayed_item_release_leaves(delayed_node,
+ delayed_node->index_item_leaves);
+ delayed_node->index_item_leaves = 0;
+ }
+
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
btrfs_delayed_item_release_metadata(root, curr_item);
@@ -1911,8 +1976,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_item(prev_item);
}
- if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
- btrfs_release_delayed_iref(delayed_node);
+ btrfs_release_delayed_iref(delayed_node);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
@@ -1983,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
}
}
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_insertion_item(node);
+ while (item) {
+ /*
+ * It's possible that the item is already in a log list. This
+ * can happen in case two tasks are trying to log the same
+ * directory. For example if we have tasks A and task B:
+ *
+ * Task A collected the delayed items into a log list while
+ * under the inode's log_mutex (at btrfs_log_inode()), but it
+ * only releases the items after logging the inodes they point
+ * to (if they are new inodes), which happens after unlocking
+ * the log mutex;
+ *
+ * Task B enters btrfs_log_inode() and acquires the log_mutex
+ * of the same directory inode, before task B releases the
+ * delayed items. This can happen for example when logging some
+ * inode we need to trigger logging of its parent directory, so
+ * logging two files that have the same parent directory can
+ * lead to this.
+ *
+ * If this happens, just ignore delayed items already in a log
+ * list. All the tasks logging the directory are under a log
+ * transaction and whichever finishes first can not sync the log
+ * before the other completes and leaves the log transaction.
+ */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, ins_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(node);
+ while (item) {
+ /* It may be non-empty, for the same reason mentioned above. */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, del_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_delayed_item *next;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+
+ list_for_each_entry_safe(item, next, ins_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ list_for_each_entry_safe(item, next, del_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and can not be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 74ae226ffaf0..0163ca637a96 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,9 +16,10 @@
#include <linux/refcount.h>
#include "ctree.h"
-/* types of the delayed item */
-#define BTRFS_DELAYED_INSERTION_ITEM 1
-#define BTRFS_DELAYED_DELETION_ITEM 2
+enum btrfs_delayed_item_type {
+ BTRFS_DELAYED_INSERTION_ITEM,
+ BTRFS_DELAYED_DELETION_ITEM
+};
struct btrfs_delayed_root {
spinlock_t lock;
@@ -58,19 +59,43 @@ struct btrfs_delayed_node {
u64 index_cnt;
unsigned long flags;
int count;
+ /*
+ * The size of the next batch of dir index items to insert (if this
+ * node is from a directory inode). Protected by @mutex.
+ */
+ u32 curr_index_batch_size;
+ /*
+ * Number of leaves reserved for inserting dir index items (if this
+ * node belongs to a directory inode). This may be larger then the
+ * actual number of leaves we end up using. Protected by @mutex.
+ */
+ u32 index_item_leaves;
};
struct btrfs_delayed_item {
struct rb_node rb_node;
- struct btrfs_key key;
+ /* Offset value of the corresponding dir index key. */
+ u64 index;
struct list_head tree_list; /* used for batch insert/delete items */
struct list_head readdir_list; /* used for readdir items */
+ /*
+ * Used when logging a directory.
+ * Insertions and deletions to this list are protected by the parent
+ * delayed node's mutex.
+ */
+ struct list_head log_list;
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
refcount_t refs;
- int ins_or_del;
- u32 data_len;
- char data[0];
+ enum btrfs_delayed_item_type type:8;
+ /*
+ * Track if this delayed item was already logged.
+ * Protected by the mutex of the parent delayed inode.
+ */
+ bool logged;
+ /* The maximum leaf size is 64K, so u16 is more than enough. */
+ u16 data_len;
+ char data[];
};
static inline void btrfs_init_delayed_root(
@@ -110,7 +135,8 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+ struct btrfs_root *root,
+ struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
@@ -132,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+
/* for init */
int __init btrfs_delayed_inode_init(void);
void __cold btrfs_delayed_inode_exit(void);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index dfdb7d4f8406..36a3debe9493 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -11,6 +11,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
+#include "tree-mod-log.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -69,9 +70,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
}
/**
- * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
- * @fs_info - the fs_info for our fs.
- * @nr - the number of items to drop.
+ * Release a ref head's reservation
+ *
+ * @fs_info: the filesystem
+ * @nr: number of items to drop
*
* This drops the delayed ref head's count from the delayed refs rsv and frees
* any excess reservation we had.
@@ -82,8 +84,18 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
- released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
- NULL);
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
+ released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, released, 0);
@@ -107,18 +119,30 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
num_bytes = btrfs_calc_insert_metadata_size(fs_info,
trans->delayed_ref_updates);
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = 0;
+ delayed_rsv->full = false;
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
}
/**
- * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
- * @fs_info - the fs info for our fs.
- * @src - the source block rsv to transfer from.
- * @num_bytes - the number of bytes to transfer.
+ * Transfer bytes to our delayed refs rsv
+ *
+ * @fs_info: the filesystem
+ * @src: source block rsv to transfer from
+ * @num_bytes: number of bytes to transfer
*
* This transfers up to the num_bytes amount from the src rsv to the
* delayed_refs_rsv. Any extra bytes are returned to the space info.
@@ -151,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
if (num_bytes)
delayed_refs_rsv->reserved += num_bytes;
if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = 1;
+ delayed_refs_rsv->full = true;
spin_unlock(&delayed_refs_rsv->lock);
if (num_bytes)
@@ -163,9 +187,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
}
/**
- * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
- * @fs_info - the fs_info for our fs.
- * @flush - control how we can flush for this reservation.
+ * Refill based on our delayed refs usage
+ *
+ * @fs_info: the filesystem
+ * @flush: control how we can flush for this reservation.
*
* This will refill the delayed block_rsv up to 1 items size worth of space and
* will return -ENOSPC if we can't make the reservation.
@@ -188,8 +213,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
- num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (ret)
return ret;
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -492,16 +516,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
- read_lock(&fs_info->tree_mod_log_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
+ seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
@@ -515,23 +530,16 @@ again:
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
- struct seq_list *elem;
int ret = 0;
+ u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
- read_lock(&fs_info->tree_mod_log_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- if (seq >= elem->seq) {
- btrfs_debug(fs_info,
- "holding back delayed_ref %#x.%x, lowest is %#x.%x",
- (u32)(seq >> 32), (u32)seq,
- (u32)(elem->seq >> 32), (u32)elem->seq);
- ret = 1;
- }
+ if (min_seq != 0 && seq >= min_seq) {
+ btrfs_debug(fs_info,
+ "holding back delayed_ref %llu, lowest is %llu",
+ seq, min_seq);
+ ret = 1;
}
- read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
@@ -649,8 +657,7 @@ inserted:
*/
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
- struct btrfs_delayed_ref_head *update,
- int *old_ref_mod_ret)
+ struct btrfs_delayed_ref_head *update)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
@@ -702,8 +709,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing->total_ref_mod;
- if (old_ref_mod_ret)
- *old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
existing->total_ref_mod += update->ref_mod;
@@ -725,6 +730,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates += csum_leaves;
}
}
+
spin_unlock(&existing->lock);
}
@@ -799,8 +805,7 @@ static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
- int action, int *qrecord_inserted_ret,
- int *old_ref_mod, int *new_ref_mod)
+ int action, int *qrecord_inserted_ret)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -822,8 +827,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- update_existing_head_ref(trans, existing, head_ref,
- old_ref_mod);
+ update_existing_head_ref(trans, existing, head_ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -831,8 +835,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- if (old_ref_mod)
- *old_ref_mod = 0;
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
@@ -846,8 +848,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
- if (new_ref_mod)
- *new_ref_mod = head_ref->total_ref_mod;
return head_ref;
}
@@ -910,8 +910,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
*/
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
@@ -928,10 +927,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -943,8 +941,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(generic_ref->real_root) &&
- is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
@@ -960,14 +956,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.root, action, ref_type);
- ref->root = generic_ref->tree_ref.root;
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.root, 0, action, false,
- is_system);
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -978,8 +975,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -997,7 +993,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(fs_info, record);
+ btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
@@ -1007,8 +1003,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod)
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_data_ref *ref;
@@ -1021,7 +1016,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 ref_root = generic_ref->data_ref.owning_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1050,8 +1045,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root) &&
- is_fstree(generic_ref->real_root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
@@ -1074,8 +1067,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1094,7 +1086,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
@@ -1110,15 +1102,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
- false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
- NULL, NULL, NULL);
+ NULL);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 1c977e6d45dc..d6304b690ec4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op {
u8 level;
bool update_key;
bool update_flags;
- bool is_data;
u64 flags_to_set;
};
@@ -135,6 +134,11 @@ struct btrfs_delayed_data_ref {
u64 offset;
};
+enum btrfs_delayed_ref_flags {
+ /* Indicate that we are flushing delayed refs for the commit */
+ BTRFS_DELAYED_REFS_FLUSHING,
+};
+
struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
@@ -158,12 +162,7 @@ struct btrfs_delayed_ref_root {
u64 pending_csums;
- /*
- * set when the tree is flushing before a transaction commit,
- * used by the throttling code to decide if new updates need
- * to be run right away
- */
- int flushing;
+ unsigned long flags;
u64 run_delayed_start;
@@ -186,8 +185,8 @@ enum btrfs_ref_type {
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Root which refers to this data extent */
- u64 ref_root;
+ /* Original root this data extent belongs to */
+ u64 owning_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -210,11 +209,11 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which refers to this tree block.
+ * Root which owns this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 root;
+ u64 owning_root;
/* For non-skinny metadata, no special member needed */
};
@@ -231,17 +230,10 @@ struct btrfs_ref {
*/
bool skip_qgroup;
- /*
- * Optional. For which root is this modification.
- * Mostly used for qgroup optimization.
- *
- * When unset, data/tree ref init code will populate it.
- * In certain cases, we're modifying reference for a different root.
- * E.g. COW fs tree blocks for balance.
- * In that case, tree_ref::root will be fs tree, but we're doing this
- * for reloc tree, then we should set @real_root to reloc tree.
- */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* Through which root is this modification. */
u64 real_root;
+#endif
u64 bytenr;
u64 len;
@@ -271,26 +263,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root)
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = root;
+ generic_ref->real_root = mod_root ?: root;
+#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.root = root;
+ generic_ref->tree_ref.owning_root = root;
generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
+
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset)
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = ref_root;
- generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
static inline struct btrfs_delayed_extent_op *
@@ -326,6 +332,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
}
}
+static inline u64 btrfs_ref_head_to_space_flags(
+ struct btrfs_delayed_ref_head *head_ref)
+{
+ if (head_ref->is_data)
+ return BTRFS_BLOCK_GROUP_DATA;
+ else if (head_ref->is_system)
+ return BTRFS_BLOCK_GROUP_SYSTEM;
+ return BTRFS_BLOCK_GROUP_METADATA;
+}
+
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
{
if (refcount_dec_and_test(&head->refs))
@@ -334,12 +350,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod);
+ u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2ca2a09d0e23..61e58066b5fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -21,17 +21,56 @@
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
+#include "zoned.h"
+#include "block-group.h"
+
+/*
+ * Device replace overview
+ *
+ * [Objective]
+ * To copy all extents (both new and on-disk) from source device to target
+ * device, while still keeping the filesystem read-write.
+ *
+ * [Method]
+ * There are two main methods involved:
+ *
+ * - Write duplication
+ *
+ * All new writes will be written to both target and source devices, so even
+ * if replace gets canceled, sources device still contains up-to-date data.
+ *
+ * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
+ * Start: btrfs_dev_replace_start()
+ * End: btrfs_dev_replace_finishing()
+ * Content: Latest data/metadata
+ *
+ * - Copy existing extents
+ *
+ * This happens by re-using scrub facility, as scrub also iterates through
+ * existing extents from commit root.
+ *
+ * Location: scrub_write_block_to_dev_replace() from
+ * scrub_block_complete()
+ * Content: Data/meta from commit root.
+ *
+ * Due to the content difference, we need to avoid nocow write when dev-replace
+ * is happening. This is done by marking the block group read-only and waiting
+ * for NOCOW writes.
+ *
+ * After replace is done, the finishing part is done by swapping the target and
+ * source devices.
+ *
+ * Location: btrfs_dev_replace_update_device_in_mapping_tree() from
+ * btrfs_dev_replace_finishing()
+ */
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -43,6 +82,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
+ if (!dev_root)
+ return 0;
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -55,6 +97,16 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
no_valid_dev_replace_entry_found:
+ /*
+ * We don't have a replace item or it's corrupted. If there is
+ * a replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ btrfs_err(fs_info,
+ "found replace target device without a valid replace item");
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
@@ -76,7 +128,7 @@ no_valid_dev_replace_entry_found:
}
slot = path->slots[0];
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
@@ -107,16 +159,25 @@ no_valid_dev_replace_entry_found:
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
+ /*
+ * We don't have an active replace item but if there is a
+ * replace target, fail the mount.
+ */
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
+ btrfs_err(fs_info,
+"replace without active item, run 'device scan --forget' on the target device");
+ ret = -EUCLEAN;
+ } else {
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
+ }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL, true);
- dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL, true);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -182,15 +243,15 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding) {
+ if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
@@ -202,10 +263,16 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return PTR_ERR(bdev);
}
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ btrfs_err(fs_info,
+ "dev-replace: zoned type of target device mismatch with filesystem");
+ ret = -EINVAL;
+ goto error;
+ }
+
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -215,8 +282,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
- if (i_size_read(bdev->bd_inode) <
- btrfs_device_get_total_bytes(srcdev)) {
+ if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
@@ -237,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
rcu_assign_pointer(device->name, name);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error;
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
@@ -255,13 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
+ device->fs_devices = fs_devices;
+
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ goto error;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
*device_out = device;
return 0;
@@ -312,7 +385,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/*
* need to delete old one and insert a new one.
* Since no attempt is made to recover any old state, if the
@@ -391,6 +464,154 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}
+static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *src_dev)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_block_group *cache;
+ struct btrfs_trans_handle *trans;
+ int iter_ret = 0;
+ int ret = 0;
+ u64 chunk_offset;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /* Ensure we don't have pending new block group */
+ spin_lock(&fs_info->trans_lock);
+ while (fs_info->running_transaction &&
+ !list_empty(&fs_info->running_transaction->dev_update_list)) {
+ spin_unlock(&fs_info->trans_lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret == -ENOENT) {
+ spin_lock(&fs_info->trans_lock);
+ continue;
+ } else {
+ goto unlock;
+ }
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret)
+ goto unlock;
+
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = src_dev->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *leaf = path->nodes[0];
+
+ if (found_key.objectid != src_dev->devid)
+ break;
+
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+
+ chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!cache)
+ continue;
+
+ set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
+ btrfs_put_block_group(cache);
+ }
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+ btrfs_free_path(path);
+unlock:
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ return ret;
+}
+
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 chunk_offset = cache->start;
+ int num_extents, cur_extent;
+ int i;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ spin_lock(&cache->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
+ spin_unlock(&cache->lock);
+ return true;
+ }
+ spin_unlock(&cache->lock);
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(em));
+ map = em->map_lookup;
+
+ num_extents = 0;
+ cur_extent = 0;
+ for (i = 0; i < map->num_stripes; i++) {
+ /* We have more device extent to copy */
+ if (srcdev != map->stripes[i].dev)
+ continue;
+
+ num_extents++;
+ if (physical == map->stripes[i].physical)
+ cur_extent = i;
+ }
+
+ free_extent_map(em);
+
+ if (num_extents > 1 && cur_extent < num_extents - 1) {
+ /*
+ * Has more stripes on this device. Keep this block group
+ * readonly until we finish all the stripes.
+ */
+ return false;
+ }
+
+ /* Last stripe on this device */
+ clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
+
+ return true;
+}
+
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
@@ -432,6 +653,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
return ret;
+ ret = mark_block_group_to_copy(fs_info, src_device);
+ if (ret)
+ return ret;
+
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -472,13 +697,18 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -559,10 +789,68 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
@@ -590,7 +878,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -612,12 +900,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
@@ -633,8 +921,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -645,9 +939,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
@@ -676,8 +971,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(src_device, tgt_device);
- list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
- fs_info->fs_devices->rw_devices++;
+ list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -700,12 +995,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -714,33 +1011,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
-}
-
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ btrfs_rm_dev_replace_free_srcdev(src_device);
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+ return 0;
}
/*
@@ -851,8 +1124,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -943,7 +1215,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -980,7 +1252,7 @@ static int btrfs_dev_replace_kthread(void *data)
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
@@ -1011,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
-{
- percpu_counter_inc(&fs_info->dev_replace.bio_counter);
-}
-
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 60b70dacc299..6084b313056a 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -7,6 +7,10 @@
#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
@@ -18,5 +22,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical);
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 863367c2c620..72fb2c518a2b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
- struct btrfs_item *item;
struct extent_buffer *leaf;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
@@ -41,10 +40,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
- BUG_ON(data_size > btrfs_item_size(leaf, item));
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0]));
+ ptr += btrfs_item_size(leaf, path->slots[0]) - data_size;
return (struct btrfs_dir_item *)ptr;
}
@@ -127,7 +125,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
btrfs_cpu_key_to_disk(&disk_key, location);
@@ -171,10 +168,40 @@ out_free:
return 0;
}
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, const char *name,
+ int name_len, int mod)
+{
+ const int ins_len = (mod < 0 ? -1 : 0);
+ const int cow = (mod != 0);
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
/*
- * lookup a directory item based on name. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -182,23 +209,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
-
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -212,7 +234,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int slot;
struct btrfs_path *path;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -221,20 +242,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- /* return back any errors */
- if (ret < 0)
- goto out;
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ /* Nothing found, we're safe */
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto out;
+ }
- /* nothing found, we're safe */
- if (ret > 0) {
- ret = 0;
- goto out;
+ if (ret < 0)
+ goto out;
}
/* we found an item, look for our name in the item */
- di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -248,7 +269,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
data_size = sizeof(*di) + name_len;
leaf = path->nodes[0];
slot = path->slots[0];
- if (data_size + btrfs_item_size_nr(leaf, slot) +
+ if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
ret = -EOVERFLOW;
} else {
@@ -261,35 +282,42 @@ out:
}
/*
- * lookup a directory item based on index. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory index item by name and index number.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
*
- * The name is used to make sure the index really points to the name you were
- * looking for.
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod)
{
- int ret;
+ struct btrfs_dir_item *di;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = objectid;
+ key.offset = index;
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
}
struct btrfs_dir_item *
@@ -297,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ERR_PTR(ret);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-
- while (1) {
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_for_each_slot(root, &key, &key, path, ret) {
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
@@ -334,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
name, name_len);
if (di)
return di;
-
- path->slots[0]++;
}
- return NULL;
+ /* Adjust return code if the key was not found in the next leaf. */
+ if (ret > 0)
+ ret = 0;
+
+ return ERR_PTR(ret);
}
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
@@ -346,21 +355,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
/*
@@ -382,7 +388,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
- total_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ total_len = btrfs_item_size(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
btrfs_dir_name_len(leaf, dir_item) +
@@ -418,7 +424,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di);
- item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_len = btrfs_item_size(leaf, path->slots[0]);
if (sub_item_len == item_len) {
ret = btrfs_del_item(trans, root, path);
} else {
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 5615320fa659..e1b7bd927d69 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group(
}
/**
- * peek_discard_list - wrap find_next_block_group()
- * @discard_ctl: discard control
+ * Wrap find_next_block_group()
+ *
+ * @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
+ * @now: time when discard was invoked, in ns
*
* This wraps find_next_block_group() and sets the block_group to be in use.
* discard_state's control flow is managed here. Variables related to
@@ -199,16 +201,15 @@ static struct btrfs_block_group *find_next_block_group(
static struct btrfs_block_group *peek_discard_list(
struct btrfs_discard_ctl *discard_ctl,
enum btrfs_discard_state *discard_state,
- int *discard_index)
+ int *discard_index, u64 now)
{
struct btrfs_block_group *block_group;
- const u64 now = ktime_get_ns();
spin_lock(&discard_ctl->lock);
again:
block_group = find_next_block_group(discard_ctl, now);
- if (block_group && now > block_group->discard_eligible_time) {
+ if (block_group && now >= block_group->discard_eligible_time) {
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group))
@@ -222,12 +223,11 @@ again:
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
discard_ctl->block_group = block_group;
+ }
+ if (block_group) {
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
- } else {
- block_group = NULL;
}
-
spin_unlock(&discard_ctl->lock);
return block_group;
@@ -330,32 +330,19 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
btrfs_discard_schedule_work(discard_ctl, false);
}
-/**
- * btrfs_discard_schedule_work - responsible for scheduling the discard work
- * @discard_ctl: discard control
- * @override: override the current timer
- *
- * Discards are issued by a delayed workqueue item. @override is used to
- * update the current delay as the baseline delay interval is reevaluated on
- * transaction commit. This is also maxed with any other rate limit.
- */
-void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
- bool override)
+static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ u64 now, bool override)
{
struct btrfs_block_group *block_group;
- const u64 now = ktime_get_ns();
-
- spin_lock(&discard_ctl->lock);
if (!btrfs_run_discard_work(discard_ctl))
- goto out;
-
+ return;
if (!override && delayed_work_pending(&discard_ctl->work))
- goto out;
+ return;
block_group = find_next_block_group(discard_ctl, now);
if (block_group) {
- unsigned long delay = discard_ctl->delay;
+ u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
/*
@@ -366,9 +353,9 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
if (kbps_limit && discard_ctl->prev_discard) {
u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
u64 bps_delay = div64_u64(discard_ctl->prev_discard *
- MSEC_PER_SEC, bps_limit);
+ NSEC_PER_SEC, bps_limit);
- delay = max(delay, msecs_to_jiffies(bps_delay));
+ delay = max(delay, bps_delay);
}
/*
@@ -378,13 +365,39 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
if (now < block_group->discard_eligible_time) {
u64 bg_timeout = block_group->discard_eligible_time - now;
- delay = max(delay, nsecs_to_jiffies(bg_timeout));
+ delay = max(delay, bg_timeout);
+ }
+
+ if (override && discard_ctl->prev_discard) {
+ u64 elapsed = now - discard_ctl->prev_discard_time;
+
+ if (delay > elapsed)
+ delay -= elapsed;
+ else
+ delay = 0;
}
mod_delayed_work(discard_ctl->discard_workers,
- &discard_ctl->work, delay);
+ &discard_ctl->work, nsecs_to_jiffies(delay));
}
-out:
+}
+
+/*
+ * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * @discard_ctl: discard control
+ * @override: override the current timer
+ *
+ * Discards are issued by a delayed workqueue item. @override is used to
+ * update the current delay as the baseline delay interval is reevaluated on
+ * transaction commit. This is also maxed with any other rate limit.
+ */
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override)
+{
+ const u64 now = ktime_get_ns();
+
+ spin_lock(&discard_ctl->lock);
+ __btrfs_discard_schedule_work(discard_ctl, now, override);
spin_unlock(&discard_ctl->lock);
}
@@ -429,13 +442,18 @@ static void btrfs_discard_workfn(struct work_struct *work)
int discard_index = 0;
u64 trimmed = 0;
u64 minlen = 0;
+ u64 now = ktime_get_ns();
discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
block_group = peek_discard_list(discard_ctl, &discard_state,
- &discard_index);
+ &discard_index, now);
if (!block_group || !btrfs_run_discard_work(discard_ctl))
return;
+ if (now < block_group->discard_eligible_time) {
+ btrfs_discard_schedule_work(discard_ctl, false);
+ return;
+ }
/* Perform discarding */
minlen = discard_minlen[discard_index];
@@ -465,8 +483,6 @@ static void btrfs_discard_workfn(struct work_struct *work)
discard_ctl->discard_extent_bytes += trimmed;
}
- discard_ctl->prev_discard = trimmed;
-
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
if (discard_state == BTRFS_DISCARD_BITMAPS) {
@@ -482,11 +498,13 @@ static void btrfs_discard_workfn(struct work_struct *work)
}
}
+ now = ktime_get_ns();
spin_lock(&discard_ctl->lock);
+ discard_ctl->prev_discard = trimmed;
+ discard_ctl->prev_discard_time = now;
discard_ctl->block_group = NULL;
+ __btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
-
- btrfs_discard_schedule_work(discard_ctl, false);
}
/**
@@ -519,7 +537,6 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
s64 discardable_bytes;
u32 iops_limit;
unsigned long delay;
- unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC;
discardable_extents = atomic_read(&discard_ctl->discardable_extents);
if (!discardable_extents)
@@ -550,12 +567,13 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
iops_limit = READ_ONCE(discard_ctl->iops_limit);
if (iops_limit)
- lower_limit = max_t(unsigned long, lower_limit,
- MSEC_PER_SEC / iops_limit);
+ delay = MSEC_PER_SEC / iops_limit;
+ else
+ delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
- delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
- delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC);
- discard_ctl->delay = msecs_to_jiffies(delay);
+ delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
+ BTRFS_DISCARD_MAX_DELAY_MSEC);
+ discard_ctl->delay_ms = delay;
spin_unlock(&discard_ctl->lock);
}
@@ -563,15 +581,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
/**
* btrfs_discard_update_discardable - propagate discard counters
* @block_group: block_group of interest
- * @ctl: free_space_ctl of @block_group
*
* This propagates deltas of counters up to the discard_ctl. It maintains a
* current counter and a previous counter passing the delta up to the global
* stat. Then the current counter value becomes the previous counter value.
*/
-void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl)
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
+ struct btrfs_free_space_ctl *ctl;
struct btrfs_discard_ctl *discard_ctl;
s32 extents_delta;
s64 bytes_delta;
@@ -581,8 +598,10 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
!btrfs_is_block_group_data_only(block_group))
return;
+ ctl = block_group->free_space_ctl;
discard_ctl = &block_group->fs_info->discard_ctl;
+ lockdep_assert_held(&ctl->tree_lock);
extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
ctl->discardable_extents[BTRFS_STAT_PREV];
if (extents_delta) {
@@ -605,7 +624,7 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
- * order of operations is changed. In the normal sychronous discard path, the
+ * order of operations is changed. In the normal synchronous discard path, the
* block groups are trimmed via a single large trim in transaction commit. This
* is ultimately what we are trying to avoid with asynchronous discard. Thus,
* it must be done before going down the unused_bgs path.
@@ -619,6 +638,7 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
bg_list) {
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
}
spin_unlock(&fs_info->unused_bgs_lock);
@@ -683,10 +703,11 @@ void btrfs_discard_init(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
discard_ctl->prev_discard = 0;
+ discard_ctl->prev_discard_time = 0;
atomic_set(&discard_ctl->discardable_extents, 0);
atomic64_set(&discard_ctl->discardable_bytes, 0);
discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
- discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC;
+ discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
discard_ctl->kbps_limit = 0;
discard_ctl->discard_extent_bytes = 0;
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 21a15776dac4..57b9202f427f 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H
@@ -28,8 +28,7 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
/* Update operations */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
-void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl);
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group);
/* Setup/cleanup operations */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6c9a6a8e6c8..d99bf7c64611 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -7,7 +7,6 @@
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
@@ -30,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
@@ -42,6 +40,9 @@
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
+#include "space-info.h"
+#include "zoned.h"
+#include "subpage.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -50,8 +51,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
-static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -64,38 +63,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete. This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
- struct bio *bio;
- bio_end_io_t *end_io;
- void *private;
- struct btrfs_fs_info *info;
- blk_status_t status;
- enum btrfs_wq_endio_type metadata;
- struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
- btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
- sizeof(struct btrfs_end_io_wq),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_end_io_wq_cache)
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
- kmem_cache_destroy(btrfs_end_io_wq_cache);
+ if (fs_info->csum_shash)
+ crypto_free_shash(fs_info->csum_shash);
}
/*
@@ -104,190 +75,41 @@ void __cold btrfs_end_io_wq_exit(void)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- void *private_data;
+ struct inode *inode;
struct bio *bio;
extent_submit_bio_start_t *submit_bio_start;
int mirror_num;
- /*
- * bio_offset is optional, can be used if the pages in the bio
- * can't tell us where in the file the bio should go
- */
- u64 bio_offset;
+
+ /* Optional parameter for submit_bio_start used by direct io */
+ u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
};
/*
- * Lockdep class keys for extent_buffer->lock's in this root. For a given
- * eb, the lockdep key is determined by the btrfs_root it belongs to and
- * the level the eb occupies in the tree.
- *
- * Different roots are used for different purposes and may nest inside each
- * other and they require separate keysets. As lockdep keys should be
- * static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->root_key.objectid. This ensures that all special purpose
- * roots have separate keysets.
- *
- * Lock-nesting across peer nodes is always done with the immediate parent
- * node locked thus preventing deadlock. As lockdep doesn't know this, use
- * subclass to avoid triggering lockdep warning in such cases.
- *
- * The key is set by the readpage_end_io_hook after the buffer has passed
- * csum validation but before the pages are unlocked. It is also set by
- * btrfs_init_new_buffer on freshly allocated blocks.
- *
- * We also add a check to make sure the highest level of the tree is the
- * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
- * needs update as well.
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# if BTRFS_MAX_LEVEL != 8
-# error
-# endif
-
-static struct btrfs_lockdep_keyset {
- u64 id; /* root objectid */
- const char *name_stem; /* lock name stem */
- char names[BTRFS_MAX_LEVEL + 1][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
-} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
- { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
- { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
- { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
- { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
- { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
- { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
- { .id = 0, .name_stem = "tree" },
-};
-
-void __init btrfs_init_lockdep(void)
-{
- int i, j;
-
- /* initialize lockdep class names */
- for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
- struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
-
- for (j = 0; j < ARRAY_SIZE(ks->names); j++)
- snprintf(ks->names[j], sizeof(ks->names[j]),
- "btrfs-%s-%02d", ks->name_stem, j);
- }
-}
-
-void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
- int level)
-{
- struct btrfs_lockdep_keyset *ks;
-
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
-
- /* find the matching keyset, id 0 is the default entry */
- for (ks = btrfs_lockdep_keysets; ks->id; ks++)
- if (ks->id == objectid)
- break;
-
- lockdep_set_class_and_name(&eb->lock,
- &ks->keys[level], ks->names[level]);
-}
-
-#endif
-
-/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
- *
- * Returns error if the extent buffer cannot be mapped.
*/
-static int csum_tree_block(struct extent_buffer *buf, u8 *result)
+static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
+ const int num_pages = num_extent_pages(buf);
+ const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- unsigned long len;
- unsigned long cur_len;
- unsigned long offset = BTRFS_CSUM_SIZE;
char *kaddr;
- unsigned long map_start;
- unsigned long map_len;
- int err;
+ int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ first_page_part - BTRFS_CSUM_SIZE);
- len = buf->len - offset;
-
- while (len > 0) {
- /*
- * Note: we don't need to check for the err == 1 case here, as
- * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
- * and 'min_len = 32' and the currently implemented mapping
- * algorithm we cannot cross a page boundary.
- */
- err = map_private_extent_buffer(buf, offset, 32,
- &kaddr, &map_start, &map_len);
- if (WARN_ON(err))
- return err;
- cur_len = min(len, map_len - (offset - map_start));
- crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
- len -= cur_len;
- offset += cur_len;
+ for (i = 1; i < num_pages; i++) {
+ kaddr = page_address(buf->pages[i]);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
-
crypto_shash_final(shash, result);
-
- return 0;
}
/*
@@ -302,7 +124,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -310,39 +131,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- if (need_lock) {
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
-
- lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
btrfs_err_rl(eb->fs_info,
- "parent transid verify failed on %llu wanted %llu found %llu",
- eb->start,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror,
parent_transid, btrfs_header_generation(eb));
ret = 1;
-
- /*
- * Things reading via commit roots that don't have normal protection,
- * like send, can have a really old block in cache that may point at a
- * block that has been freed and re-allocated. So don't clear uptodate
- * if we find an eb that is under IO (dirty/writeback) because we could
- * end up reading in the stale data and then writing it back out and
- * making everybody very sad.
- */
- if (!extent_buffer_under_io(eb))
- clear_extent_buffer_uptodate(eb);
+ clear_extent_buffer_uptodate(eb);
out:
- unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
- if (need_lock)
- btrfs_tree_read_unlock_blocking(eb);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -363,27 +166,23 @@ static bool btrfs_supported_super_csum(u16 csum_type)
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
*/
-static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
- char *raw_disk_sb)
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb)
{
- struct btrfs_super_block *disk_sb =
- (struct btrfs_super_block *)raw_disk_sb;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
- crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_final(shash, result);
+ crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
- if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+ if (memcmp(disk_sb->csum, result, fs_info->csum_size))
return 1;
return 0;
@@ -455,9 +254,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
-static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+int btrfs_read_extent_buffer(struct extent_buffer *eb,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
@@ -506,164 +305,202 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
return ret;
}
-/*
- * checksum a dirty tree block before IO. This has extra checks to make sure
- * we only fill in the checksum field in the first page of a multi-page block
- */
-
-static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
+static int csum_one_extent_buffer(struct extent_buffer *eb)
{
- u64 start = page_offset(page);
- u64 found_start;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
u8 result[BTRFS_CSUM_SIZE];
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- struct extent_buffer *eb;
int ret;
- eb = (struct extent_buffer *)page->private;
- if (page != eb->pages[0])
- return 0;
-
- found_start = btrfs_header_bytenr(eb);
- /*
- * Please do not consolidate these warnings into a single if.
- * It is useful to know what went wrong.
- */
- if (WARN_ON(found_start != start))
- return -EUCLEAN;
- if (WARN_ON(!PageUptodate(page)))
- return -EUCLEAN;
-
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
- btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
-
- if (csum_tree_block(eb, result))
- return -EINVAL;
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
+ csum_tree_block(eb, result);
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
else
ret = btrfs_check_leaf_full(eb);
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Also check the generation, the eb reached here must be newer than
+ * last committed. Or something seriously wrong happened.
+ */
+ if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ ret = -EUCLEAN;
btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
+ "block=%llu bad generation, have %llu expect > %llu",
+ eb->start, btrfs_header_generation(eb),
+ fs_info->last_trans_committed);
+ goto error;
}
- write_extent_buffer(eb, result, 0, csum_size);
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
+
+error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
}
-static int check_tree_block_fsid(struct extent_buffer *eb)
+/* Checksum all dirty extent buffers in one bio_vec */
+static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
+ struct bio_vec *bvec)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 cur;
+ int ret = 0;
- read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
+ cur += fs_info->nodesize) {
+ struct extent_buffer *eb;
+ bool uptodate;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ eb = find_extent_buffer(fs_info, cur);
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
+ fs_info->nodesize);
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
+ /* A dirty eb shouldn't disappear from buffer_radix */
+ if (WARN_ON(!eb))
+ return -EUCLEAN;
+
+ if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ if (WARN_ON(!uptodate)) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
}
- fs_devices = fs_devices->seed;
+
+ ret = csum_one_extent_buffer(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
}
return ret;
}
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+/*
+ * Checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block.
+ * For subpage extent buffers we need bvec to also read the offset in the page.
+ */
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
+ struct page *page = bvec->bv_page;
+ u64 start = page_offset(page);
u64 found_start;
- int found_level;
struct extent_buffer *eb;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int ret = 0;
- u8 result[BTRFS_CSUM_SIZE];
- int reads_done;
- if (!page->private)
- goto out;
+ if (fs_info->nodesize < PAGE_SIZE)
+ return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
+ if (page != eb->pages[0])
+ return 0;
- /* the pending IO might have been the only thing that kept this buffer
- * in memory. Make sure we have a ref for all this other checks
+ found_start = btrfs_header_bytenr(eb);
+
+ if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+ WARN_ON(found_start != 0);
+ return 0;
+ }
+
+ /*
+ * Please do not consolidate these warnings into a single if.
+ * It is useful to know what went wrong.
*/
- atomic_inc(&eb->refs);
+ if (WARN_ON(found_start != start))
+ return -EUCLEAN;
+ if (WARN_ON(!PageUptodate(page)))
+ return -EUCLEAN;
- reads_done = atomic_dec_and_test(&eb->io_pages);
- if (!reads_done)
- goto err;
+ return csum_one_extent_buffer(eb);
+}
- eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
- ret = -EIO;
- goto err;
- }
+static int check_tree_block_fsid(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ u8 fsid[BTRFS_FSID_SIZE];
+ u8 *metadata_uuid;
+
+ read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE);
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
+
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
+}
+
+/* Do basic extent buffer checks at read time */
+static int validate_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u64 found_start;
+ const u32 csum_size = fs_info->csum_size;
+ u8 found_level;
+ u8 result[BTRFS_CSUM_SIZE];
+ const u8 *header_csum;
+ int ret = 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
- eb->start, found_start);
+ btrfs_err_rl(fs_info,
+ "bad tree block start, mirror %u want %llu have %llu",
+ eb->read_mirror, eb->start, found_start);
ret = -EIO;
- goto err;
+ goto out;
}
if (check_tree_block_fsid(eb)) {
- btrfs_err_rl(fs_info, "bad fsid on block %llu",
- eb->start);
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
ret = -EIO;
- goto err;
+ goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "bad tree block level %d on %llu",
- (int)btrfs_header_level(eb), eb->start);
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
- eb, found_level);
-
- ret = csum_tree_block(eb, result);
- if (ret)
- goto err;
-
- if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ csum_tree_block(eb, result);
+ header_csum = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
- read_extent_buffer(eb, &val, 0, csum_size);
+ if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
- fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start, eb->read_mirror,
+ CSUM_FMT_VALUE(csum_size, header_csum),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
- goto err;
+ goto out;
}
/*
@@ -683,79 +520,106 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
set_extent_buffer_uptodate(eb);
else
btrfs_err(fs_info,
- "block=%llu read time tree block corruption detected",
- eb->start);
-err:
- if (reads_done &&
- test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb, ret);
-
- if (ret) {
- /*
- * our io error hook is going to dec the io pages
- * again, we have to make sure it has something
- * to decrement
- */
- atomic_inc(&eb->io_pages);
- clear_extent_buffer_uptodate(eb);
- }
- free_extent_buffer(eb);
+ "read time tree block corruption detected on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
out:
return ret;
}
-static void end_workqueue_bio(struct bio *bio)
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+ int mirror)
{
- struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
- struct btrfs_fs_info *fs_info;
- struct btrfs_workqueue *wq;
-
- fs_info = end_io_wq->info;
- end_io_wq->status = bio->bi_status;
-
- if (bio_op(bio) == REQ_OP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- wq = fs_info->endio_meta_write_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- wq = fs_info->endio_freespace_worker;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else
- wq = fs_info->endio_write_workers;
- } else {
- if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
- wq = fs_info->endio_repair_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct extent_buffer *eb;
+ bool reads_done;
+ int ret = 0;
+
+ /*
+ * We don't allow bio merge for subpage metadata read, so we should
+ * only get one eb for each endio hook.
+ */
+ ASSERT(end == start + fs_info->nodesize - 1);
+ ASSERT(PagePrivate(page));
+
+ eb = find_extent_buffer(fs_info, start);
+ /*
+ * When we are reading one tree block, eb must have been inserted into
+ * the radix tree. If not, something is wrong.
+ */
+ ASSERT(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ /* Subpage read must finish in page read */
+ ASSERT(reads_done);
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
}
+ ret = validate_extent_buffer(eb);
+ if (ret < 0)
+ goto err;
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
- btrfs_queue_work(wq, &end_io_wq->work);
+ set_extent_buffer_uptodate(eb);
+
+ free_extent_buffer(eb);
+ return ret;
+err:
+ /*
+ * end_bio_extent_readpage decrements io_pages in case of error,
+ * make sure it has something to decrement.
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ free_extent_buffer(eb);
+ return ret;
}
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata)
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
- struct btrfs_end_io_wq *end_io_wq;
+ struct extent_buffer *eb;
+ int ret = 0;
+ int reads_done;
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
- if (!end_io_wq)
- return BLK_STS_RESOURCE;
+ ASSERT(page->private);
- end_io_wq->private = bio->bi_private;
- end_io_wq->end_io = bio->bi_end_io;
- end_io_wq->info = info;
- end_io_wq->status = 0;
- end_io_wq->bio = bio;
- end_io_wq->metadata = metadata;
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ return validate_subpage_buffer(page, start, end, mirror);
- bio->bi_private = end_io_wq;
- bio->bi_end_io = end_workqueue_bio;
- return 0;
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * The pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ atomic_inc(&eb->refs);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
+err:
+ if (ret) {
+ /*
+ * our io error hook is going to dec the io pages
+ * again, we have to make sure it has something
+ * to decrement
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ }
+ free_extent_buffer(eb);
+
+ return ret;
}
static void run_one_async_start(struct btrfs_work *work)
@@ -764,8 +628,8 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->private_data, async->bio,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->bio,
+ async->dio_file_offset);
if (ret)
async->status = ret;
}
@@ -780,17 +644,14 @@ static void run_one_async_start(struct btrfs_work *work)
*/
static void run_one_async_done(struct btrfs_work *work)
{
- struct async_submit_bio *async;
- struct inode *inode;
- blk_status_t ret;
-
- async = container_of(work, struct async_submit_bio, work);
- inode = async->private_data;
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
- async->bio->bi_status = async->status;
- bio_endio(async->bio);
+ btrfs_bio_end_io(bbio, async->status);
return;
}
@@ -800,11 +661,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
- if (ret) {
- async->bio->bi_status = ret;
- bio_endio(async->bio);
- }
+ btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -815,18 +672,25 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
- extent_submit_bio_start_t *submit_bio_start)
+/*
+ * Submit bio to an async queue.
+ *
+ * Retrun:
+ * - true if the work has been succesfuly submitted
+ * - false in case of error
+ */
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return BLK_STS_RESOURCE;
+ return false;
- async->private_data = private_data;
+ async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -834,15 +698,15 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
- async->bio_offset = bio_offset;
+ async->dio_file_offset = dio_file_offset;
async->status = 0;
if (op_is_sync(bio->bi_opf))
- btrfs_set_work_high_priority(&async->work);
-
- btrfs_queue_work(fs_info->workers, &async->work);
- return 0;
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
+ return true;
}
static blk_status_t btree_csum_one_bio(struct bio *bio)
@@ -855,7 +719,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+ ret = csum_dirty_buffer(root->fs_info, bvec);
if (ret)
break;
}
@@ -863,91 +727,81 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
/*
* when we're called for a write, we're already in the async
- * submission context. Just jump into btrfs_map_bio
+ * submission context. Just jump into btrfs_submit_bio.
*/
return btree_csum_one_bio(bio);
}
-static int check_async_write(struct btrfs_fs_info *fs_info,
+static bool should_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
+ if (btrfs_is_zoned(fs_info))
+ return false;
if (atomic_read(&bi->sync_writers))
- return 0;
+ return false;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
- return 0;
- return 1;
+ return false;
+ return true;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int async = check_async_write(fs_info, BTRFS_I(inode));
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
- if (bio_op(bio) != REQ_OP_WRITE) {
- /*
- * called for a read, do the setup so that checksum validation
- * can happen in the async kernel threads
- */
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_METADATA);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!async) {
- ret = btree_csum_one_bio(bio);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else {
- /*
- * kthread helpers are used to submit writes so that
- * checksumming can happen in parallel across all CPUs
- */
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
- 0, inode, btree_submit_bio_start);
+ bio->bi_opf |= REQ_META;
+
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
}
- if (ret)
- goto out_w_error;
- return 0;
+ /*
+ * Kthread helpers are used to submit writes so that checksumming can
+ * happen in parallel across all CPUs.
+ */
+ if (should_async_write(fs_info, BTRFS_I(inode)) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ return;
-out_w_error:
- bio->bi_status = ret;
- bio_endio(bio);
- return ret;
+ ret = btree_csum_one_bio(bio);
+ if (ret) {
+ btrfs_bio_end_io(bbio, ret);
+ return;
+ }
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
#ifdef CONFIG_MIGRATION
-static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+static int btree_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
* we haven't done the locking hook
*/
- if (PageDirty(page))
+ if (folio_test_dirty(src))
return -EAGAIN;
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_get_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
+#else
+#define btree_migrate_folio NULL
#endif
-
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -970,114 +824,126 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
+static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btree_get_extent, 0);
-}
-
-static int btree_releasepage(struct page *page, gfp_t gfp_flags)
-{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
- return try_release_extent_buffer(page);
+ return try_release_extent_buffer(&folio->page);
}
-static void btree_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btree_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- extent_invalidatepage(tree, page, offset);
- btree_releasepage(page, GFP_NOFS);
- if (PagePrivate(page)) {
- btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
- "page private not zero on page %llu",
- (unsigned long long)page_offset(page));
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
+ tree = &BTRFS_I(folio->mapping->host)->io_tree;
+ extent_invalidate_folio(tree, folio, offset);
+ btree_release_folio(folio, GFP_NOFS);
+ if (folio_get_private(folio)) {
+ btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ "folio private not zero on folio %llu",
+ (unsigned long long)folio_pos(folio));
+ folio_detach_private(folio);
}
}
-static int btree_set_page_dirty(struct page *page)
-{
#ifdef DEBUG
+static bool btree_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
struct extent_buffer *eb;
+ int cur_bit = 0;
+ u64 page_start = folio_pos(folio);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ eb = folio_get_private(folio);
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_write_locked(eb);
+ return filemap_dirty_folio(mapping, folio);
+ }
+ subpage = folio_get_private(folio);
+
+ ASSERT(subpage->dirty_bitmap);
+ while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
+ unsigned long flags;
+ u64 cur;
+ u16 tmp = (1 << cur_bit);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!(tmp & subpage->dirty_bitmap)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur_bit++;
+ continue;
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur = page_start + cur_bit * fs_info->sectorsize;
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
- BUG_ON(!eb);
- BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
-#endif
- return __set_page_dirty_nobuffers(page);
+ eb = find_extent_buffer(fs_info, cur);
+ ASSERT(eb);
+ ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ ASSERT(atomic_read(&eb->refs));
+ btrfs_assert_tree_write_locked(eb);
+ free_extent_buffer(eb);
+
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
+ }
+ return filemap_dirty_folio(mapping, folio);
}
+#else
+#define btree_dirty_folio filemap_dirty_folio
+#endif
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
- .releasepage = btree_releasepage,
- .invalidatepage = btree_invalidatepage,
-#ifdef CONFIG_MIGRATION
- .migratepage = btree_migratepage,
-#endif
- .set_page_dirty = btree_set_page_dirty,
+ .release_folio = btree_release_folio,
+ .invalidate_folio = btree_invalidate_folio,
+ .migrate_folio = btree_migrate_folio,
+ .dirty_folio = btree_dirty_folio,
};
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(buf))
- return;
-
- ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
- if (ret < 0)
- free_extent_buffer_stale(buf);
- else
- free_extent_buffer(buf);
-}
-
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr)
+ u64 bytenr, u64 owner_root,
+ int level)
{
if (btrfs_is_testing(fs_info))
return alloc_test_extent_buffer(fs_info, bytenr);
- return alloc_extent_buffer(fs_info, bytenr);
+ return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}
/*
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
+ * @owner_root: the objectid of the root owner for this block.
* @parent_transid: expected transid of this tree block, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key in slot 0, skip check if NULL
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
+ ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
+ if (btrfs_check_eb_owner(buf, owner_root)) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
return buf;
}
@@ -1087,61 +953,40 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-buf->len,
fs_info->dirty_metadata_batch);
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
}
-static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
-{
- struct btrfs_subvolume_writers *writers;
- int ret;
-
- writers = kmalloc(sizeof(*writers), GFP_NOFS);
- if (!writers)
- return ERR_PTR(-ENOMEM);
-
- ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
- if (ret < 0) {
- kfree(writers);
- return ERR_PTR(ret);
- }
-
- init_waitqueue_head(&writers->wait);
- return writers;
-}
-
-static void
-btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
-{
- percpu_counter_destroy(&writers->counter);
- kfree(writers);
-}
-
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ memset(&root->root_key, 0, sizeof(root->root_key));
+ memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ root->fs_info = fs_info;
+ root->root_key.objectid = objectid;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
- root->orphan_cleanup_state = 0;
+ RB_CLEAR_NODE(&root->rb_node);
root->last_trans = 0;
- root->highest_objectid = 0;
+ root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
- root->block_rsv = NULL;
+
+ btrfs_init_root_block_rsv(root);
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
@@ -1163,6 +1008,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1173,36 +1019,35 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
- atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
- if (!dummy)
+ root->anon_dev = 0;
+ if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
-
- memset(&root->root_key, 0, sizeof(root->root_key));
- memset(&root->root_item, 0, sizeof(root->root_item));
- memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- if (!dummy)
- root->defrag_trans_start = fs_info->generation;
- else
- root->defrag_trans_start = 0;
- root->root_key.objectid = objectid;
- root->anon_dev = 0;
+ extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE, NULL);
+ }
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&root->leak_list);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ list_add_tail(&root->leak_list, &fs_info->allocated_roots);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+#endif
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- gfp_t flags)
+ u64 objectid, gfp_t flags)
{
struct btrfs_root *root = kzalloc(sizeof(*root), flags);
if (root)
- root->fs_info = fs_info;
+ __setup_root(root, fs_info, objectid);
return root;
}
@@ -1215,18 +1060,113 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
if (!fs_info)
return ERR_PTR(-EINVAL);
- root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
/* We don't use the stripesize in selftest, set it as sectorsize */
- __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
root->alloc_bytenr = 0;
return root;
}
#endif
+static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
+{
+ const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
+ const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
+}
+
+static int global_root_key_cmp(const void *k, const struct rb_node *node)
+{
+ const struct btrfs_key *key = k;
+ const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
+
+ return btrfs_comp_cpu_keys(key, &root->root_key);
+}
+
+int btrfs_global_root_insert(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct rb_node *tmp;
+
+ write_lock(&fs_info->global_root_lock);
+ tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
+ write_unlock(&fs_info->global_root_lock);
+ ASSERT(!tmp);
+
+ return tmp ? -EEXIST : 0;
+}
+
+void btrfs_global_root_delete(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ write_lock(&fs_info->global_root_lock);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ write_unlock(&fs_info->global_root_lock);
+}
+
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key)
+{
+ struct rb_node *node;
+ struct btrfs_root *root = NULL;
+
+ read_lock(&fs_info->global_root_lock);
+ node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
+ if (node)
+ root = container_of(node, struct btrfs_root, rb_node);
+ read_unlock(&fs_info->global_root_lock);
+
+ return root;
+}
+
+static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ u64 ret;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (bytenr)
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ else
+ block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
+ ASSERT(block_group);
+ if (!block_group)
+ return 0;
+ ret = block_group->global_root_id;
+ btrfs_put_block_group(block_group);
+
+ return ret;
+}
+
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_CSUM_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_EXTENT_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
+ };
+
+ return btrfs_global_root(fs_info, &key);
+}
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1237,28 +1177,27 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_key key;
unsigned int nofs_flag;
int ret = 0;
- uuid_le uuid = NULL_UUID_LE;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
- root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail;
+ goto fail_unlock;
}
root->node = leaf;
@@ -1267,8 +1206,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- root->root_item.flags = 0;
- root->root_item.byte_limit = 0;
+ btrfs_set_root_flags(&root->root_item, 0);
+ btrfs_set_root_limit(&root->root_item, 0);
btrfs_set_root_bytenr(&root->root_item, leaf->start);
btrfs_set_root_generation(&root->root_item, trans->transid);
btrfs_set_root_level(&root->root_item, 0);
@@ -1277,9 +1216,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
if (is_fstree(objectid))
- uuid_le_gen(&uuid);
- memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
- root->root_item.drop_level = 0;
+ generate_random_guid(root->root_item.uuid);
+ else
+ export_guid(root->root_item.uuid, &guid_null);
+ btrfs_set_root_drop_level(&root->root_item, 0);
+
+ btrfs_tree_unlock(leaf);
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1288,17 +1230,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- btrfs_tree_unlock(leaf);
-
return root;
-fail:
- if (leaf) {
+fail_unlock:
+ if (leaf)
btrfs_tree_unlock(leaf);
- free_extent_buffer(root->commit_root);
- free_extent_buffer(leaf);
- }
- kfree(root);
+fail:
+ btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1307,39 +1245,44 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct extent_buffer *leaf;
- root = btrfs_alloc_root(fs_info, GFP_NOFS);
+ root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct extent_buffer *leaf;
+
/*
- * DON'T set REF_COWS for log trees
+ * DON'T set SHAREABLE bit for log trees.
*
- * log trees do not get reference counted because they go away
- * before a real commit is actually done. They do store pointers
- * to file data extents, and those reference counts still get
- * updated (along with back refs to the log tree).
+ * Log trees are not exposed to user space thus can't be snapshotted,
+ * and they go away before a real commit is actually done.
+ *
+ * They do store pointers to file data extents, and those reference
+ * counts still get updated (along with back refs to the log tree).
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
- if (IS_ERR(leaf)) {
- kfree(root);
- return ERR_CAST(leaf);
- }
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+ if (IS_ERR(leaf))
+ return PTR_ERR(leaf);
root->node = leaf;
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
- return root;
+
+ return 0;
}
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -1350,6 +1293,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+
+ if (!btrfs_is_zoned(fs_info)) {
+ int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+ }
+
WARN_ON(fs_info->log_root_tree);
fs_info->log_root_tree = log_root;
return 0;
@@ -1361,11 +1314,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log_root;
struct btrfs_inode_item *inode_item;
+ int ret;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+ ret = btrfs_alloc_log_tree_node(trans, log_root);
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+
log_root->last_trans = trans->transid;
log_root->root_key.offset = root->root_key.objectid;
@@ -1387,115 +1347,130 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+ struct btrfs_path *path,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
- struct btrfs_path *path;
u64 generation;
int ret;
int level;
- path = btrfs_alloc_path();
- if (!path)
+ root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
+ if (!root)
return ERR_PTR(-ENOMEM);
- root = btrfs_alloc_root(fs_info, GFP_NOFS);
- if (!root) {
- ret = -ENOMEM;
- goto alloc_fail;
- }
-
- __setup_root(root, fs_info, key->objectid);
-
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto find_fail;
+ goto fail;
}
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation, level, NULL);
+ key->objectid, generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
- goto find_fail;
- } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ root->node = NULL;
+ goto fail;
+ }
+ if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- free_extent_buffer(root->node);
- goto find_fail;
+ goto fail;
+ }
+
+ /*
+ * For real fs, and not log/reloc trees, root owner must
+ * match its root node owner
+ */
+ if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ root->root_key.objectid != btrfs_header_owner(root->node)) {
+ btrfs_crit(fs_info,
+"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+ root->root_key.objectid, root->node->start,
+ btrfs_header_owner(root->node),
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ goto fail;
}
root->commit_root = btrfs_root_node(root);
-out:
- btrfs_free_path(path);
return root;
-
-find_fail:
- kfree(root);
-alloc_fail:
- root = ERR_PTR(ret);
- goto out;
+fail:
+ btrfs_put_root(root);
+ return ERR_PTR(ret);
}
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
- struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
+ struct btrfs_path *path;
- root = btrfs_read_tree_root(tree_root, location);
- if (IS_ERR(root))
- return root;
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- set_bit(BTRFS_ROOT_REF_COWS, &root->state);
- btrfs_check_and_init_root_item(&root->root_item);
- }
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ root = read_tree_root_path(tree_root, path, key);
+ btrfs_free_path(path);
return root;
}
-int btrfs_init_fs_root(struct btrfs_root *root)
+/*
+ * Initialize subvolume root in-memory structure
+ *
+ * @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ */
+static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
- struct btrfs_subvolume_writers *writers;
+ unsigned int nofs_flag;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
+ /*
+ * We might be called under a transaction (e.g. indirect backref
+ * resolution) which could deadlock if it triggers memory reclaim
+ */
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_drew_lock_init(&root->snapshot_lock);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret)
goto fail;
- }
- writers = btrfs_alloc_subvolume_writers();
- if (IS_ERR(writers)) {
- ret = PTR_ERR(writers);
- goto fail;
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ !btrfs_is_data_reloc_root(root)) {
+ set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
+ btrfs_check_and_init_root_item(&root->root_item);
}
- root->subv_writers = writers;
-
- btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->ino_cache_lock);
- init_waitqueue_head(&root->ino_cache_wait);
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ /*
+ * Don't assign anonymous block device to roots that are not exposed to
+ * userspace, the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ if (!anon_dev) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ } else {
+ root->anon_dev = anon_dev;
+ }
+ }
mutex_lock(&root->objectid_mutex);
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
goto fail;
}
- ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
mutex_unlock(&root->objectid_mutex);
@@ -1505,18 +1480,56 @@ fail:
return ret;
}
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id)
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
{
struct btrfs_root *root;
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)root_id);
+ if (root)
+ root = btrfs_grab_root(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+ u64 objectid)
+{
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->tree_root);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->chunk_root);
+ if (objectid == BTRFS_DEV_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->dev_root);
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_UUID_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
+ struct btrfs_root *root = btrfs_global_root(fs_info, &key);
+
+ return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
+ }
+ return NULL;
+}
+
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
@@ -1530,51 +1543,123 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0)
+ if (ret == 0) {
+ btrfs_grab_root(root);
set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ }
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
return ret;
}
-struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- bool check_ref)
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_root *root;
+
+ while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
+ root = list_first_entry(&fs_info->allocated_roots,
+ struct btrfs_root, leak_list);
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(&root->root_key, buf),
+ refcount_read(&root->refs));
+ while (refcount_read(&root->refs) > 1)
+ btrfs_put_root(root);
+ btrfs_put_root(root);
+ }
+#endif
+}
+
+static void free_global_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root;
+ struct rb_node *node;
+
+ while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
+ root = rb_entry(node, struct btrfs_root, rb_node);
+ rb_erase(&root->rb_node, &fs_info->global_root_tree);
+ btrfs_put_root(root);
+ }
+}
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->ordered_bytes);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
+ btrfs_free_csum_hash(fs_info);
+ btrfs_free_stripe_hash_table(fs_info);
+ btrfs_free_ref_cache(fs_info);
+ kfree(fs_info->balance_ctl);
+ kfree(fs_info->delayed_root);
+ free_global_roots(fs_info);
+ btrfs_put_root(fs_info->tree_root);
+ btrfs_put_root(fs_info->chunk_root);
+ btrfs_put_root(fs_info->dev_root);
+ btrfs_put_root(fs_info->quota_root);
+ btrfs_put_root(fs_info->uuid_root);
+ btrfs_put_root(fs_info->fs_root);
+ btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_put_root(fs_info->block_group_root);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
+ kvfree(fs_info);
+}
+
+
+/*
+ * Get an in-memory reference of a root structure.
+ *
+ * For essential trees like root/extent tree, we grab it from fs_info directly.
+ * For subvolume trees, we check the cached filesystem roots first. If not
+ * found, then read it from disk and add it to cached fs roots.
+ *
+ * Caller should release the root by calling btrfs_put_root() after the usage.
+ *
+ * NOTE: Reloc and log trees can't be read by this function as they share the
+ * same root objectid.
+ *
+ * @objectid: root id
+ * @anon_dev: preallocated anonymous block device number for new roots,
+ * pass 0 for new allocation.
+ * @check_ref: whether to check root item references, If true, return -ENOENT
+ * for orphan roots
+ */
+static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev,
+ bool check_ref)
{
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
- return fs_info->tree_root;
- if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return fs_info->extent_root;
- if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
- return fs_info->chunk_root;
- if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
- return fs_info->dev_root;
- if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
- return fs_info->csum_root;
- if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return fs_info->quota_root ? fs_info->quota_root :
- ERR_PTR(-ENOENT);
- if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
- return fs_info->uuid_root ? fs_info->uuid_root :
- ERR_PTR(-ENOENT);
- if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return fs_info->free_space_root ? fs_info->free_space_root :
- ERR_PTR(-ENOENT);
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
again:
- root = btrfs_lookup_fs_root(fs_info, location->objectid);
+ root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+ /* Shouldn't get preallocated anon_dev for cached roots */
+ ASSERT(!anon_dev);
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_put_root(root);
return ERR_PTR(-ENOENT);
+ }
return root;
}
- root = btrfs_read_fs_root(fs_info->tree_root, location);
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(root))
return root;
@@ -1583,7 +1668,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root);
+ ret = btrfs_init_fs_root(root, anon_dev);
if (ret)
goto fail;
@@ -1594,7 +1679,7 @@ again:
}
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
- key.offset = location->objectid;
+ key.offset = objectid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
btrfs_free_path(path);
@@ -1606,61 +1691,101 @@ again:
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
if (ret == -EEXIST) {
- btrfs_free_fs_root(root);
+ btrfs_put_root(root);
goto again;
}
goto fail;
}
return root;
fail:
- btrfs_free_fs_root(root);
+ /*
+ * If our caller provided us an anonymous device, then it's his
+ * responsibility to free it in case we fail. So we have to set our
+ * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+ * and once again by our caller.
+ */
+ if (anon_dev)
+ root->anon_dev = 0;
+ btrfs_put_root(root);
return ERR_PTR(ret);
}
-static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+/*
+ * Get in-memory reference of a root structure
+ *
+ * @objectid: tree objectid
+ * @check_ref: if set, verify that the tree exists and the item has at least
+ * one reference
+ */
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, bool check_ref)
{
- struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
- int ret = 0;
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
+ return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+}
- rcu_read_lock();
- list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
- bdi = device->bdev->bd_bdi;
- if (bdi_congested(bdi, bdi_bits)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
- return ret;
+/*
+ * Get in-memory reference of a root structure, created as new, optionally pass
+ * the anonymous block device id
+ *
+ * @objectid: tree objectid
+ * @anon_dev: if zero, allocate a new anonymous block device or use the
+ * parameter value
+ */
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev)
+{
+ return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions. This is where read checksum verification actually happens
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info: the fs_info
+ * @objectid: the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
*/
-static void end_workqueue_fn(struct btrfs_work *work)
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid)
{
- struct bio *bio;
- struct btrfs_end_io_wq *end_io_wq;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+
+ ASSERT(path->search_commit_root && path->skip_locking);
+
+ /*
+ * This can return -ENOENT if we ask for a root that doesn't exist, but
+ * since this is called via the backref walking code we won't be looking
+ * up a root that doesn't exist, unless there's corruption. So if root
+ * != NULL just return it.
+ */
+ root = btrfs_get_global_root(fs_info, objectid);
+ if (root)
+ return root;
+
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root)
+ return root;
- end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
- bio = end_io_wq->bio;
+ key.objectid = objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ root = read_tree_root_path(fs_info->tree_root, path, &key);
+ btrfs_release_path(path);
- bio->bi_status = end_io_wq->status;
- bio->bi_private = end_io_wq->private;
- bio->bi_end_io = end_io_wq->end_io;
- bio_endio(bio);
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
+ return root;
}
static int cleaner_kthread(void *arg)
{
- struct btrfs_root *root = arg;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = arg;
int again;
while (1) {
@@ -1693,7 +1818,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(fs_info);
- again = btrfs_clean_one_deleted_snapshot(root);
+ again = btrfs_clean_one_deleted_snapshot(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
/*
@@ -1703,16 +1828,23 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(fs_info);
/*
- * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
- * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * after acquiring fs_info->reclaim_bgs_lock. So we
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * Reclaim block groups in the reclaim_bgs list after we deleted
+ * all unused block_groups. This possibly gives us some more free
+ * space.
+ */
+ btrfs_reclaim_bgs(fs_info);
sleep:
- clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
+ clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
kthread_parkme();
if (kthread_should_stop())
@@ -1732,13 +1864,13 @@ static int transaction_kthread(void *arg)
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
u64 transid;
- time64_t now;
+ time64_t delta;
unsigned long delay;
bool cannot_commit;
do {
cannot_commit = false;
- delay = HZ * fs_info->commit_interval;
+ delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1748,13 +1880,14 @@ static int transaction_kthread(void *arg)
goto sleep;
}
- now = ktime_get_seconds();
- if (cur->state < TRANS_STATE_COMMIT_START &&
- !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
- (now < cur->start_time ||
- now - cur->start_time < fs_info->commit_interval)) {
+ delta = ktime_get_seconds() - cur->start_time;
+ if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
+ cur->state < TRANS_STATE_COMMIT_START &&
+ delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay = HZ * 5;
+ delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay = min(delay,
+ msecs_to_jiffies(fs_info->commit_interval * 1000));
goto sleep;
}
transid = cur->transid;
@@ -1776,8 +1909,7 @@ sleep:
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
- if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &fs_info->fs_state)))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
@@ -1846,11 +1978,23 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
- btrfs_set_backup_extent_root_gen(root_backup,
- btrfs_header_generation(info->extent_root->node));
- btrfs_set_backup_extent_root_level(root_backup,
- btrfs_header_level(info->extent_root->node));
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
+
+ btrfs_set_backup_extent_root(root_backup,
+ extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(extent_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(csum_root->node));
+ }
/*
* we might commit during log recovery, which happens before we set
@@ -1871,12 +2015,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
- btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
- btrfs_set_backup_csum_root_gen(root_backup,
- btrfs_header_generation(info->csum_root->node));
- btrfs_set_backup_csum_root_level(root_backup,
- btrfs_header_level(info->csum_root->node));
-
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
@@ -1942,16 +2080,20 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
- btrfs_destroy_workqueue(fs_info->endio_workers);
- btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- btrfs_destroy_workqueue(fs_info->endio_repair_workers);
- btrfs_destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->rmw_workers)
+ destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
- btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
if (fs_info->discard_ctl.discard_workers)
@@ -1961,8 +2103,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
- btrfs_destroy_workqueue(fs_info->endio_meta_workers);
- btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -1975,19 +2117,51 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
}
+static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root, *tmp;
+
+ rbtree_postorder_for_each_entry_safe(root, tmp,
+ &fs_info->global_root_tree,
+ rb_node)
+ free_root_extent_buffers(root);
+}
+
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
+ free_global_root_pointers(info);
free_root_extent_buffers(info->dev_root);
- free_root_extent_buffers(info->extent_root);
- free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
+ free_root_extent_buffers(info->fs_root);
+ free_root_extent_buffers(info->data_reloc_root);
+ free_root_extent_buffers(info->block_group_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
- free_root_extent_buffers(info->free_space_root);
+}
+
+void btrfs_put_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+
+ if (refcount_dec_and_test(&root->refs)) {
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
+ btrfs_drew_lock_destroy(&root->snapshot_lock);
+ free_root_extent_buffers(root);
+#ifdef CONFIG_BTRFS_DEBUG
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ list_del_init(&root->leak_list);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+#endif
+ kfree(root);
+ }
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2001,13 +2175,9 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
- } else {
- free_extent_buffer(gang[0]->node);
- free_extent_buffer(gang[0]->commit_root);
- btrfs_put_fs_root(gang[0]);
- }
+ btrfs_put_root(gang[0]);
}
while (1) {
@@ -2019,11 +2189,6 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++)
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- btrfs_free_log_root_tree(NULL, fs_info);
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
- }
}
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
@@ -2045,11 +2210,14 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
}
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2063,16 +2231,15 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = false;
+ IO_TREE_BTREE_INODE_IO, NULL);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
- BTRFS_I(inode)->root = fs_info->tree_root;
- memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
+ BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
+ BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
+ BTRFS_I(inode)->location.type = 0;
+ BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- btrfs_insert_inode_hash(inode);
+ __insert_inode_hash(inode, hash);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2091,17 +2258,19 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
mutex_init(&fs_info->qgroup_rescan_lock);
}
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices)
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue(fs_info, "worker",
+ btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+ fs_info->hipri_workers =
+ btrfs_alloc_workqueue(fs_info, "worker-high",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
@@ -2118,52 +2287,37 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
fs_info->endio_workers =
- btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+ alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
- max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
- max_active, 2);
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
- max_active, 4);
- fs_info->endio_repair_workers =
- btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
+ fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
fs_info->delayed_workers =
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
max_active, 0);
- fs_info->readahead_workers =
- btrfs_alloc_workqueue(fs_info, "readahead", flags,
- max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->flush_workers &&
+ if (!(fs_info->workers && fs_info->hipri_workers &&
+ fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
- fs_info->endio_repair_workers &&
+ fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->readahead_workers &&
- fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->qgroup_rescan_workers &&
+ fs_info->caching_workers && fs_info->fixup_workers &&
+ fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@@ -2186,14 +2340,12 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
return 0;
}
-static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
-{
- crypto_free_shash(fs_info->csum_shash);
-}
-
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
@@ -2208,33 +2360,34 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return -EIO;
}
- log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
+ GFP_KERNEL);
if (!log_tree_root)
return -ENOMEM;
- __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1,
- level, NULL);
+ BTRFS_TREE_LOG_OBJECTID,
+ fs_info->generation + 1, level,
+ NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
- kfree(log_tree_root);
+ log_tree_root->node = NULL;
+ btrfs_put_root(log_tree_root);
return ret;
- } else if (!extent_buffer_uptodate(log_tree_root->node)) {
+ }
+ if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ btrfs_put_root(log_tree_root);
return -EIO;
}
+
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ btrfs_put_root(log_tree_root);
return ret;
}
@@ -2247,6 +2400,115 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return 0;
}
+static int load_global_roots_objectid(struct btrfs_root *tree_root,
+ struct btrfs_path *path, u64 objectid,
+ const char *name)
+{
+ struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_root *root;
+ u64 max_global_id = 0;
+ int ret;
+ struct btrfs_key key = {
+ .objectid = objectid,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ bool found = false;
+
+ /* If we have IGNOREDATACSUMS skip loading these roots. */
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
+ btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ return 0;
+ }
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ break;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+ ret = 0;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != objectid)
+ break;
+ btrfs_release_path(path);
+
+ /*
+ * Just worry about this for extent tree, it'll be the same for
+ * everybody.
+ */
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ max_global_id = max(max_global_id, key.offset);
+
+ found = true;
+ root = read_tree_root_path(tree_root, path, &key);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = PTR_ERR(root);
+ break;
+ }
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ ret = btrfs_global_root_insert(root);
+ if (ret) {
+ btrfs_put_root(root);
+ break;
+ }
+ key.offset++;
+ }
+ btrfs_release_path(path);
+
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ fs_info->nr_global_roots = max_global_id + 1;
+
+ if (!found || ret) {
+ if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+ set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ ret = ret ? ret : -ENOENT;
+ else
+ ret = 0;
+ btrfs_err(fs_info, "failed to load root %s", name);
+ }
+ return ret;
+}
+
+static int load_global_roots(struct btrfs_root *tree_root)
+{
+ struct btrfs_path *path;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_EXTENT_TREE_OBJECTID, "extent");
+ if (ret)
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_CSUM_TREE_OBJECTID, "csum");
+ if (ret)
+ goto out;
+ if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
+ goto out;
+ ret = load_global_roots_objectid(tree_root, path,
+ BTRFS_FREE_SPACE_TREE_OBJECTID,
+ "free space");
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -2256,36 +2518,58 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
BUG_ON(!fs_info->tree_root);
- location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ ret = load_global_roots(tree_root);
+ if (ret)
+ return ret;
+
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
+ /* Initialize fs_info for all devices in any case */
+ ret = btrfs_init_devices_late(fs_info);
+ if (ret)
+ goto out;
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
+ /*
+ * This tree can share blocks with some other fs tree during relocation
+ * and we need a proper setup by btrfs_get_fs_root
+ */
+ root = btrfs_get_fs_root(tree_root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID, true);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
@@ -2298,23 +2582,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_UUID_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- if (ret != -ENOENT)
- goto out;
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->uuid_root = root;
- }
-
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
- goto out;
+ if (ret != -ENOENT)
+ goto out;
}
+ } else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
+ fs_info->uuid_root = root;
}
return 0;
@@ -2334,8 +2609,8 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-static int validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2375,13 +2650,22 @@ static int validate_super(struct btrfs_fs_info *fs_info,
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
- /* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_SIZE) {
+
+ /*
+ * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ *
+ * We can support 16K sectorsize with 64K page size without problem,
+ * but such sectorsize/pagesize combination doesn't make much sense.
+ * 4K will be our future standard, PAGE_SIZE is supported from the very
+ * beginning.
+ */
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
btrfs_err(fs_info,
- "sectorsize %llu not supported yet, only support %lu",
+ "sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
ret = -EINVAL;
}
+
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -2410,6 +2694,36 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+ BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+ "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
+ fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
+ memcmp(fs_info->fs_devices->metadata_uuid,
+ fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
+ fs_info->super_copy->metadata_uuid,
+ fs_info->fs_devices->metadata_uuid);
+ ret = -EINVAL;
+ }
+
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2492,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
- return validate_super(fs_info, fs_info->super_copy, 0);
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}
/*
@@ -2506,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = validate_super(fs_info, sb, -1);
+ ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -2530,6 +2844,46 @@ out:
return ret;
}
+static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
+{
+ int ret = 0;
+
+ root->node = read_tree_block(root->fs_info, bytenr,
+ root->root_key.objectid, gen, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ }
+ if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ root->node = NULL;
+ return -EIO;
+ }
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ root->commit_root = btrfs_root_node(root);
+ btrfs_set_root_refs(&root->root_item, 1);
+ return ret;
+}
+
+static int load_important_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 gen, bytenr;
+ int level, ret;
+
+ bytenr = btrfs_super_root(sb);
+ gen = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read tree root");
+ return ret;
+ }
+ return 0;
+}
+
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
int backup_index = find_newest_super_backup(fs_info);
@@ -2540,9 +2894,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int i;
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
- u64 generation;
- int level;
-
if (handle_error) {
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
@@ -2567,39 +2918,24 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
}
- generation = btrfs_super_generation(sb);
- level = btrfs_super_root_level(sb);
- tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
- generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
- handle_error = true;
- if (IS_ERR(tree_root->node))
- ret = PTR_ERR(tree_root->node);
- else if (!extent_buffer_uptodate(tree_root->node))
- ret = -EUCLEAN;
-
- btrfs_warn(fs_info, "failed to read tree root");
+ ret = load_important_roots(fs_info);
+ if (ret) {
+ handle_error = true;
continue;
}
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
/*
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
*/
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(tree_root);
if (ret < 0) {
handle_error = true;
continue;
}
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
ret = btrfs_read_roots(fs_info);
if (ret < 0) {
@@ -2608,8 +2944,9 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
}
/* All successful */
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ fs_info->generation = btrfs_header_generation(tree_root->node);
+ fs_info->last_trans_committed = fs_info->generation;
+ fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
if (backup_index < 0) {
@@ -2624,67 +2961,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
return ret;
}
-int __cold open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- u32 sectorsize;
- u32 nodesize;
- u32 stripesize;
- u64 generation;
- u64 features;
- u16 csum_type;
- struct btrfs_key location;
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *tree_root;
- struct btrfs_root *chunk_root;
- int ret;
- int err = -EINVAL;
- int clear_free_space_tree = 0;
- int level;
-
- tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
- chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
- if (!tree_root || !chunk_root) {
- err = -ENOMEM;
- goto fail;
- }
-
- ret = init_srcu_struct(&fs_info->subvol_srcu);
- if (ret) {
- err = ret;
- goto fail;
- }
-
- ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_dio_bytes;
- }
- fs_info->dirty_metadata_batch = PAGE_SIZE *
- (1 + ilog2(nr_cpu_ids));
-
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_dirty_metadata_bytes;
- }
-
- ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
- GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_delalloc_bytes;
- }
-
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2700,17 +2978,43 @@ int __cold open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
+ rwlock_init(&fs_info->global_root_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
- mutex_init(&fs_info->delete_unused_bgs_mutex);
+ mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
+ INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&fs_info->allocated_roots);
+ INIT_LIST_HEAD(&fs_info->allocated_ebs);
+ spin_lock_init(&fs_info->eb_leak_lock);
+#endif
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2724,10 +3028,9 @@ int __cold open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
- atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
- fs_info->sb = sb;
+ fs_info->global_root_tree = RB_ROOT;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
@@ -2735,9 +3038,6 @@ int __cold open_ctree(struct super_block *sb,
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
- /* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- spin_lock_init(&fs_info->reada_lock);
btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2746,43 +3046,18 @@ int __cold open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->ordered_roots);
spin_lock_init(&fs_info->ordered_root_lock);
- fs_info->btree_inode = new_inode(sb);
- if (!fs_info->btree_inode) {
- err = -ENOMEM;
- goto fail_bio_counter;
- }
- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
- fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
- GFP_KERNEL);
- if (!fs_info->delayed_root) {
- err = -ENOMEM;
- goto fail_iput;
- }
- btrfs_init_delayed_root(fs_info->delayed_root);
-
btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
-
- sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
- sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
-
- btrfs_init_btree_inode(fs_info);
+ btrfs_init_async_reclaim_work(fs_info);
- spin_lock_init(&fs_info->block_group_cache_lock);
- fs_info->block_group_cache_tree = RB_ROOT;
- fs_info->first_logical_byte = (u64)-1;
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
- extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
- IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
- IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
- fs_info->pinned_extents = &fs_info->freed_extents[0];
- set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
+ extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -2811,48 +3086,392 @@ int __cold open_ctree(struct super_block *sb,
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
fs_info->sectorsize = 4096;
+ fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
+
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
- fs_info->send_in_progress = 0;
+ fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+ INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
+}
+
+static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ int ret;
+
+ fs_info->sb = sb;
+ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
+ sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
+
+ ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ fs_info->dirty_metadata_batch = PAGE_SIZE *
+ (1 + ilog2(nr_cpu_ids));
+
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_KERNEL);
+ if (!fs_info->delayed_root)
+ return -ENOMEM;
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ if (sb_rdonly(sb))
+ set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+
+ return btrfs_alloc_stripe_hash_table(fs_info);
+}
+
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ int ret;
+
+ /*
+ * 1st step is to iterate through the existing UUID tree and
+ * to delete all entries that contain outdated data.
+ * 2nd step is to add all missing entries to the UUID tree.
+ */
+ ret = btrfs_uuid_tree_iterate(fs_info);
+ if (ret < 0) {
+ if (ret != -EINTR)
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d",
+ ret);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return ret;
+ }
+ return btrfs_uuid_scan_kthread(data);
+}
+
+static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct task_struct *task;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+ /* fs_info->update_uuid_tree_gen remains 0 in all error case */
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
+
+/*
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount
+ * code paths.
+ */
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
+{
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+}
+
+/*
+ * Mounting logic specific to read-write file systems. Shared by open_ctree
+ * and btrfs_remount when remounting from read-only to read-write.
+ */
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+ bool clear_free_space_tree = false;
+
+ if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ clear_free_space_tree = true;
+ } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
+ btrfs_warn(fs_info, "free space tree is invalid");
+ clear_free_space_tree = true;
+ }
+
+ if (clear_free_space_tree) {
+ btrfs_info(fs_info, "clearing free space tree");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to clear free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ /*
+ * btrfs_find_orphan_roots() is responsible for finding all the dead
+ * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+ * them into the fs_info->fs_roots_radix tree. This must be done before
+ * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+ * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+ * item before the root's tree is deleted - this means that if we unmount
+ * or crash before the deletion completes, on the next mount we will not
+ * delete what remains of the tree because the orphan item does not
+ * exists anymore, which is what tells us we have a pending deletion.
+ */
+ ret = btrfs_find_orphan_roots(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto out;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ goto out;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(fs_info);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
+ goto out;
+ }
+
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info, "creating free space tree");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
+ ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto out;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree %d", ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify on-disk format like free space tree and
+ * screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such runtime limitation doesn't really need a incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata writes, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hard codeed PAGE_SIZE usage, while
+ * we're already defaulting to v2 cache, no need to bother v1 as it's
+ * going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ u16 csum_type;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ int ret;
+ int err = -EINVAL;
+ int level;
- ret = btrfs_alloc_stripe_hash_table(fs_info);
+ ret = init_mount_fs_info(fs_info, sb);
if (ret) {
err = ret;
- goto fail_alloc;
+ goto fail;
+ }
+
+ /* These need to be init'ed before we start creating inodes and such. */
+ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->tree_root = tree_root;
+ chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->chunk_root = chunk_root;
+ if (!tree_root || !chunk_root) {
+ err = -ENOMEM;
+ goto fail;
}
- __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+ btrfs_init_btree_inode(fs_info);
- invalidate_bdev(fs_devices->latest_bdev);
+ invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
- bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (IS_ERR(bh)) {
- err = PTR_ERR(bh);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
+ if (IS_ERR(disk_super)) {
+ err = PTR_ERR(disk_super);
goto fail_alloc;
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
- csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
+ csum_type = btrfs_super_csum_type(disk_super);
if (!btrfs_supported_super_csum(csum_type)) {
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
csum_type);
err = -EINVAL;
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
+ fs_info->csum_size = btrfs_super_csum_size(disk_super);
+
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
+ btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
@@ -2860,11 +3479,11 @@ int __cold open_ctree(struct super_block *sb,
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, bh->b_data)) {
+ if (btrfs_check_super_csum(fs_info, disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
- brelse(bh);
- goto fail_csum;
+ btrfs_release_disk_super(disk_super);
+ goto fail_alloc;
}
/*
@@ -2872,19 +3491,11 @@ int __cold open_ctree(struct super_block *sb,
* following bytes up to INFO_SIZE, the checksum is calculated from
* the whole block of INFO_SIZE
*/
- memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
- brelse(bh);
+ memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
+ btrfs_release_disk_super(disk_super);
disk_super = fs_info->super_copy;
- ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE));
-
- if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
- ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid,
- BTRFS_FSID_SIZE));
- }
features = btrfs_super_flags(disk_super);
if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
@@ -2901,11 +3512,11 @@ int __cold open_ctree(struct super_block *sb,
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
if (!btrfs_super_root(disk_super))
- goto fail_csum;
+ goto fail_alloc;
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -2917,92 +3528,62 @@ int __cold open_ctree(struct super_block *sb,
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret) {
- err = ret;
- goto fail_csum;
- }
-
- features = btrfs_super_incompat_flags(disk_super) &
- ~BTRFS_FEATURE_INCOMPAT_SUPP;
- if (features) {
- btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
- features);
- err = -EINVAL;
- goto fail_csum;
- }
-
- features = btrfs_super_incompat_flags(disk_super);
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
- if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- btrfs_info(fs_info, "has skinny extents");
-
- /*
- * flag our filesystem as having big metadata blocks if
- * they are bigger than the page size
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
+ /* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = sectorsize;
fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
- /* Cache block sizes */
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- /*
- * mixed block groups end up with duplicate but slightly offset
- * extent buffers for the same range. It leads to corruptions
- */
- if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != nodesize)) {
- btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
- nodesize, sectorsize);
- goto fail_csum;
+ ret = btrfs_parse_options(fs_info, options, sb->s_flags);
+ if (ret) {
+ err = ret;
+ goto fail_alloc;
}
- /*
- * Needn't use the lock because there is no other task which will
- * update the flag.
- */
- btrfs_set_super_incompat_flags(disk_super, features);
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0) {
+ err = ret;
+ goto fail_alloc;
+ }
- features = btrfs_super_compat_ro_flags(disk_super) &
- ~BTRFS_FEATURE_COMPAT_RO_SUPP;
- if (!sb_rdonly(sb) && features) {
- btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
- features);
- err = -EINVAL;
- goto fail_csum;
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
+ /*
+ * V1 space cache has some hardcoded PAGE_SIZE usage, and is
+ * going to be deprecated.
+ *
+ * Force to use v2 cache for subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
+ btrfs_warn(fs_info,
+ "read-write for sector size %u with page size %lu is experimental",
+ sectorsize, PAGE_SIZE);
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
}
- ret = btrfs_init_workqueues(fs_info, fs_devices);
+ ret = btrfs_init_workqueues(fs_info);
if (ret) {
err = ret;
goto fail_sb_buffer;
}
- sb->s_bdi->congested_fn = btrfs_congested_fn;
- sb->s_bdi->congested_data = fs_info;
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3020,25 +3601,16 @@ int __cold open_ctree(struct super_block *sb,
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
-
- __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
-
- chunk_root->node = read_tree_block(fs_info,
- btrfs_super_chunk_root(disk_super),
- generation, level, NULL);
- if (IS_ERR(chunk_root->node) ||
- !extent_buffer_uptodate(chunk_root->node)) {
+ ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
+ generation, level);
+ if (ret) {
btrfs_err(fs_info, "failed to read chunk root");
- if (!IS_ERR(chunk_root->node))
- free_extent_buffer(chunk_root->node);
- chunk_root->node = NULL;
goto fail_tree_roots;
}
- btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
- chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
+ offsetof(struct btrfs_header, chunk_tree_uuid),
+ BTRFS_UUID_SIZE);
ret = btrfs_read_chunk_tree(fs_info);
if (ret) {
@@ -3047,12 +3619,14 @@ int __cold open_ctree(struct super_block *sb,
}
/*
- * Keep the devid that is marked to be the target device for the
- * device replace procedure
+ * At this point we know all the devices that make this filesystem,
+ * including the seed devices but we don't know yet if the replace
+ * target is required. So free devices that are not part of this
+ * filesystem but skip the replace target device which is checked
+ * below in btrfs_init_dev_replace().
*/
- btrfs_free_extra_devids(fs_devices, 0);
-
- if (!fs_devices->latest_bdev) {
+ btrfs_free_extra_devids(fs_devices);
+ if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
@@ -3061,6 +3635,31 @@ int __cold open_ctree(struct super_block *sb,
if (ret)
goto fail_tree_roots;
+ /*
+ * Get zone type information of zoned block devices. This will also
+ * handle emulation of a zoned filesystem if a regular device has the
+ * zoned incompat feature flag set.
+ */
+ ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to read device zone info: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ /*
+ * If we have a uuid root and we're not being told to rescan we need to
+ * check the generation here so we can set the
+ * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
+ * transaction during a balance or the log replay without updating the
+ * uuid generation, and then if we crash we would rescan the uuid tree,
+ * even though it was perfectly fine.
+ */
+ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
+ fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+
ret = btrfs_verify_dev_extents(fs_info);
if (ret) {
btrfs_err(fs_info,
@@ -3086,7 +3685,12 @@ int __cold open_ctree(struct super_block *sb,
goto fail_block_groups;
}
- btrfs_free_extra_devids(fs_devices, 1);
+ ret = btrfs_check_zoned_mode(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to initialize zoned mode: %d",
+ ret);
+ goto fail_block_groups;
+ }
ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
@@ -3113,13 +3717,16 @@ int __cold open_ctree(struct super_block *sb,
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_free_zone_cache(fs_info);
+
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
}
- fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
goto fail_sysfs;
@@ -3145,8 +3752,7 @@ int __cold open_ctree(struct super_block *sb,
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
ret = btrfsic_mount(fs_info, fs_devices,
btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
- 1 : 0,
+ CHECK_INTEGRITY_DATA) ? 1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
btrfs_warn(fs_info,
@@ -3172,31 +3778,7 @@ int __cold open_ctree(struct super_block *sb,
}
}
- ret = btrfs_find_orphan_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- if (!sb_rdonly(sb)) {
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0) {
- btrfs_warn(fs_info, "failed to recover relocation: %d",
- ret);
- err = -EINVAL;
- goto fail_qgroup;
- }
- }
-
- location.objectid = BTRFS_FS_TREE_OBJECTID;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = 0;
-
- fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
btrfs_warn(fs_info, "failed to read fs tree: %d", err);
@@ -3205,78 +3787,18 @@ int __cold open_ctree(struct super_block *sb,
}
if (sb_rdonly(sb))
- return 0;
-
- if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
- btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- clear_free_space_tree = 1;
- } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
- btrfs_warn(fs_info, "free space tree is invalid");
- clear_free_space_tree = 1;
- }
+ goto clear_oneshot;
- if (clear_free_space_tree) {
- btrfs_info(fs_info, "clearing free space tree");
- ret = btrfs_clear_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to clear free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
-
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_info(fs_info, "creating free space tree");
- ret = btrfs_create_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
-
- down_read(&fs_info->cleanup_work_sem);
- if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
- (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
- up_read(&fs_info->cleanup_work_sem);
- close_ctree(fs_info);
- return ret;
- }
- up_read(&fs_info->cleanup_work_sem);
-
- ret = btrfs_resume_balance_async(fs_info);
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
- btrfs_warn(fs_info, "failed to resume balance: %d", ret);
close_ctree(fs_info);
return ret;
}
-
- ret = btrfs_resume_dev_replace_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
-
- btrfs_qgroup_rescan_resume(fs_info);
btrfs_discard_resume(fs_info);
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
- fs_info->generation !=
- btrfs_super_uuid_tree_generation(disk_super)) {
+ if (fs_info->uuid_root &&
+ (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
+ fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
@@ -3285,17 +3807,16 @@ int __cold open_ctree(struct super_block *sb,
close_ctree(fs_info);
return ret;
}
- } else {
- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
}
+
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- /*
- * backuproot only affect mount behavior, and if open_ctree succeeded,
- * no need to keep the flag
- */
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ /* Kick the cleaner thread so it'll start deleting snapshots. */
+ if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
+clear_oneshot:
+ btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3323,96 +3844,110 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
-fail_csum:
- btrfs_free_csum_hash(fs_info);
fail_alloc:
-fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
-fail_bio_counter:
- percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
-fail_delalloc_bytes:
- percpu_counter_destroy(&fs_info->delalloc_bytes);
-fail_dirty_metadata_bytes:
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_dio_bytes:
- percpu_counter_destroy(&fs_info->dio_bytes);
-fail_srcu:
- cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+static void btrfs_end_super_write(struct bio *bio)
{
- if (uptodate) {
- set_buffer_uptodate(bh);
- } else {
- struct btrfs_device *device = (struct btrfs_device *)
- bh->b_private;
-
- btrfs_warn_rl_in_rcu(device->fs_info,
- "lost page write due to IO error on %s",
- rcu_str_deref(device->name));
- /* note, we don't set_buffer_write_io_error because we have
- * our own ways of dealing with the IO errors
- */
- clear_buffer_uptodate(bh);
- btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+ struct btrfs_device *device = bio->bi_private;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ struct page *page;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ page = bvec->bv_page;
+
+ if (bio->bi_status) {
+ btrfs_warn_rl_in_rcu(device->fs_info,
+ "lost page write due to IO error on %s (%d)",
+ rcu_str_deref(device->name),
+ blk_status_to_errno(bio->bi_status));
+ ClearPageUptodate(page);
+ SetPageError(page);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ } else {
+ SetPageUptodate(page);
+ }
+
+ put_page(page);
+ unlock_page(page);
}
- unlock_buffer(bh);
- put_bh(bh);
+
+ bio_put(bio);
}
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
- struct buffer_head **bh_ret)
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num, bool drop_cache)
{
- struct buffer_head *bh;
struct btrfs_super_block *super;
- u64 bytenr;
+ struct page *page;
+ u64 bytenr, bytenr_orig;
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ int ret;
- bytenr = btrfs_sb_offset(copy_num);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
- return -EINVAL;
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret == -ENOENT)
+ return ERR_PTR(-EINVAL);
+ else if (ret)
+ return ERR_PTR(ret);
- bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
- /*
- * If we fail to read from the underlying devices, as of now
- * the best option we have is to mark it EIO.
- */
- if (!bh)
- return -EIO;
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
+ return ERR_PTR(-EINVAL);
- super = (struct btrfs_super_block *)bh->b_data;
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- return -EINVAL;
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
}
- *bh_ret = bh;
- return 0;
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr_orig) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return super;
}
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
- struct buffer_head *bh;
- struct buffer_head *latest = NULL;
- struct btrfs_super_block *super;
+ struct btrfs_super_block *super, *latest = NULL;
int i;
u64 transid = 0;
- int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3420,48 +3955,42 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- ret = btrfs_read_dev_one_super(bdev, i, &bh);
- if (ret)
+ super = btrfs_read_dev_one_super(bdev, i, false);
+ if (IS_ERR(super))
continue;
- super = (struct btrfs_super_block *)bh->b_data;
-
if (!latest || btrfs_super_generation(super) > transid) {
- brelse(latest);
- latest = bh;
+ if (latest)
+ btrfs_release_disk_super(super);
+
+ latest = super;
transid = btrfs_super_generation(super);
- } else {
- brelse(bh);
}
}
- if (!latest)
- return ERR_PTR(ret);
-
- return latest;
+ return super;
}
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
- * buffer heads we write are pinned.
+ * pages we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
- * Return number of errors when buffer head is not found or submission fails.
+ * Return number of errors when page is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
+ struct address_space *mapping = device->bdev->bd_inode->i_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct buffer_head *bh;
int i;
- int ret;
int errors = 0;
- u64 bytenr;
- int op_flags;
+ int ret;
+ u64 bytenr, bytenr_orig;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3469,48 +3998,73 @@ static int write_dev_supers(struct btrfs_device *device,
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
- bytenr = btrfs_sb_offset(i);
+ struct page *page;
+ struct bio *bio;
+ struct btrfs_super_block *disk_super;
+
+ bytenr_orig = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ btrfs_err(device->fs_info,
+ "couldn't get super block location for mirror %d",
+ i);
+ errors++;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- btrfs_set_super_bytenr(sb, bytenr);
+ btrfs_set_super_bytenr(sb, bytenr_orig);
- crypto_shash_init(shash);
- crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- crypto_shash_final(shash, sb->csum);
+ crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
+ sb->csum);
- /* One reference for us, and we leave it for the caller */
- bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh) {
+ page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
+ GFP_NOFS);
+ if (!page) {
btrfs_err(device->fs_info,
- "couldn't get super buffer head for bytenr %llu",
+ "couldn't get super block page for bytenr %llu",
bytenr);
errors++;
continue;
}
- memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+ /* Bump the refcount for wait_dev_supers() */
+ get_page(page);
- /* one reference for submit_bh */
- get_bh(bh);
+ disk_super = page_address(page);
+ memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
- set_buffer_uptodate(bh);
- lock_buffer(bh);
- bh->b_end_io = btrfs_end_buffer_write_sync;
- bh->b_private = device;
+ /*
+ * Directly use bios here instead of relying on the page cache
+ * to do I/O, so we don't lose the ability to do integrity
+ * checking.
+ */
+ bio = bio_alloc(device->bdev, 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOFS);
+ bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+ bio->bi_private = device;
+ bio->bi_end_io = btrfs_end_super_write;
+ __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+ offset_in_page(bytenr));
/*
- * we fua the first super. The others we allow
- * to go down lazy.
+ * We FUA only the first super block. The others we allow to
+ * go down lazy and there's a short window where the on-disk
+ * copies might still contain the older version.
*/
- op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
- op_flags |= REQ_FUA;
- ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
- if (ret)
+ bio->bi_opf |= REQ_FUA;
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
+
+ if (btrfs_advance_sb_log(device, i))
errors++;
}
return errors < i ? 0 : -1;
@@ -3520,47 +4074,57 @@ static int write_dev_supers(struct btrfs_device *device,
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
- * Return number of errors when buffer head is not found or not marked up to
+ * Return number of errors when page is not found or not marked up to
* date.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
- struct buffer_head *bh;
int i;
int errors = 0;
bool primary_failed = false;
+ int ret;
u64 bytenr;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
- bytenr = btrfs_sb_offset(i);
+ struct page *page;
+
+ ret = btrfs_sb_log_location(device, i, READ, &bytenr);
+ if (ret == -ENOENT) {
+ break;
+ } else if (ret < 0) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- bh = __find_get_block(device->bdev,
- bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh) {
+ page = find_get_page(device->bdev->bd_inode->i_mapping,
+ bytenr >> PAGE_SHIFT);
+ if (!page) {
errors++;
if (i == 0)
primary_failed = true;
continue;
}
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
+ /* Page is submitted locked and unlocked once the IO completes */
+ wait_on_page_locked(page);
+ if (PageError(page)) {
errors++;
if (i == 0)
primary_failed = true;
}
- /* drop our reference */
- brelse(bh);
+ /* Drop our reference */
+ put_page(page);
- /* drop the reference from the writing run */
- brelse(bh);
+ /* Drop the reference from the writing run */
+ put_page(page);
}
/* log error, force error return */
@@ -3579,6 +4143,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
+ bio_uninit(bio);
complete(bio->bi_private);
}
@@ -3588,20 +4153,31 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ if (!bdev_write_cache(device->bdev))
return;
+#endif
- bio_reset(bio);
+ bio_init(bio, device->bdev, NULL, 0,
+ REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
- bio_set_dev(bio, device->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -3610,7 +4186,7 @@ static void write_dev_flush(struct btrfs_device *device)
*/
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
@@ -3832,44 +4408,25 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
+ bool drop_ref = false;
+
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (btrfs_root_refs(&root->root_item) == 0)
- synchronize_srcu(&fs_info->subvol_srcu);
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- btrfs_free_log(NULL, root);
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ASSERT(root->log_root == NULL);
if (root->reloc_root) {
- free_extent_buffer(root->reloc_root->node);
- free_extent_buffer(root->reloc_root->commit_root);
- btrfs_put_fs_root(root->reloc_root);
+ btrfs_put_root(root->reloc_root);
root->reloc_root = NULL;
}
}
- if (root->free_ino_pinned)
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- if (root->free_ino_ctl)
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
- btrfs_free_fs_root(root);
-}
-
-void btrfs_free_fs_root(struct btrfs_root *root)
-{
- iput(root->ino_cache_inode);
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- if (root->anon_dev)
- free_anon_bdev(root->anon_dev);
- if (root->subv_writers)
- btrfs_free_subvolume_writers(root->subv_writers);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
- btrfs_put_fs_root(root);
+ if (drop_ref)
+ btrfs_put_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3879,15 +4436,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
int i = 0;
int err = 0;
unsigned int ret = 0;
- int index;
while (1) {
- index = srcu_read_lock(&fs_info->subvol_srcu);
+ spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
if (!ret) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
root_objectid = gang[ret - 1]->root_key.objectid + 1;
@@ -3899,9 +4455,9 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
continue;
}
/* grab all the search result for later use */
- gang[i] = btrfs_grab_fs_root(gang[i]);
+ gang[i] = btrfs_grab_root(gang[i]);
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) {
if (!gang[i])
@@ -3910,7 +4466,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
err = btrfs_orphan_cleanup(gang[i]);
if (err)
break;
- btrfs_put_fs_root(gang[i]);
+ btrfs_put_root(gang[i]);
}
root_objectid++;
}
@@ -3918,7 +4474,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
/* release the uncleaned roots due to error */
for (; i < ret; i++) {
if (gang[i])
- btrfs_put_fs_root(gang[i]);
+ btrfs_put_root(gang[i]);
}
return err;
}
@@ -3943,11 +4499,75 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
return btrfs_commit_transaction(trans);
}
+static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_transaction *trans;
+ struct btrfs_transaction *tmp;
+ bool found = false;
+
+ if (list_empty(&fs_info->trans_list))
+ return;
+
+ /*
+ * This function is only called at the very end of close_ctree(),
+ * thus no other running transaction, no need to take trans_lock.
+ */
+ ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
+ list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
+ struct extent_state *cached = NULL;
+ u64 dirty_bytes = 0;
+ u64 cur = 0;
+ u64 found_start;
+ u64 found_end;
+
+ found = true;
+ while (!find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ dirty_bytes += found_end + 1 - found_start;
+ cur = found_end + 1;
+ }
+ btrfs_warn(fs_info,
+ "transaction %llu (with %llu dirty metadata bytes) is not committed",
+ trans->transid, dirty_bytes);
+ btrfs_cleanup_one_transaction(trans, fs_info);
+
+ if (trans == fs_info->running_transaction)
+ fs_info->running_transaction = NULL;
+ list_del_init(&trans->list);
+
+ btrfs_put_transaction(trans);
+ trace_btrfs_transaction_commit(fs_info);
+ }
+ ASSERT(!found);
+}
+
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+
+ /*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ * We must do this before stopping the block group reclaim task, because
+ * at btrfs_relocate_block_group() we wait for this bit, and after the
+ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+ * return 1.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
+ /*
+ * We may have the reclaim task running and relocating a data block group,
+ * in which case it may create delayed iputs. So stop it before we park
+ * the cleaner kthread otherwise we can get new delayed iputs after
+ * parking the cleaner, and that can make the async reclaim task to hang
+ * if it's waiting for delayed iputs to complete, since the cleaner is
+ * parked and can not run delayed iputs - this will make us hang when
+ * trying to stop the async reclaim task.
+ */
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
/*
* We don't want the cleaner to start new transactions, add more delayed
* iputs, etc. while we're closing. We can't use kthread_stop() yet
@@ -3978,7 +4598,34 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
+ cancel_work_sync(&fs_info->preempt_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -3990,13 +4637,25 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
btrfs_delete_unused_bgs(fs_info);
+ /*
+ * There might be existing delayed inode workers still running
+ * and holding an empty delayed inode item. We must wait for
+ * them to complete first because they can create a transaction.
+ * This happens when someone calls btrfs_balance_delayed_items()
+ * and then a transaction commit runs the same delayed nodes
+ * before any delayed worker has done something with the nodes.
+ * We must wait for any worker here and not at transaction
+ * commit time since that could cause a deadlock.
+ * This is a very rare case.
+ */
+ btrfs_flush_workqueue(fs_info->delayed_workers);
+
ret = btrfs_commit_super(fs_info);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
- test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
@@ -4005,6 +4664,11 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
@@ -4013,15 +4677,13 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- if (percpu_counter_sum(&fs_info->dio_bytes))
+ if (percpu_counter_sum(&fs_info->ordered_bytes))
btrfs_info(fs_info, "at unmount dio bytes count %lld",
- percpu_counter_sum(&fs_info->dio_bytes));
+ percpu_counter_sum(&fs_info->ordered_bytes));
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
- btrfs_free_fs_roots(fs_info);
-
btrfs_put_block_group_cache(fs_info);
/*
@@ -4031,8 +4693,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
+ /* We shouldn't have any transaction open at this point */
+ warn_about_uncommitted_trans(fs_info);
+
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
+ btrfs_free_fs_roots(fs_info);
/*
* We must free the block groups after dropping the fs_roots as we could
@@ -4052,16 +4718,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
-
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
- percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->dio_bytes);
- percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
- cleanup_srcu_struct(&fs_info->subvol_srcu);
-
- btrfs_free_csum_hash(fs_info);
- btrfs_free_stripe_hash_table(fs_info);
- btrfs_free_ref_cache(fs_info);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4083,8 +4739,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
@@ -4097,9 +4752,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- root = BTRFS_I(buf->pages[0]->mapping->host)->root;
- fs_info = root->fs_info;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
@@ -4155,13 +4808,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key)
-{
- return btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
-}
-
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
@@ -4175,6 +4821,36 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
up_write(&fs_info->cleanup_work_sem);
}
+static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
+
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
+ }
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ btrfs_free_log_root_tree(NULL, fs_info);
+}
+
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
@@ -4235,7 +4911,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_lock(&delayed_refs->lock);
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
- btrfs_info(fs_info, "delayed_refs has NO entry");
+ btrfs_debug(fs_info, "delayed_refs has NO entry");
return ret;
}
@@ -4269,9 +4945,27 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (pin_bytes)
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ if (pin_bytes) {
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+ BUG_ON(!cache);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ cache->space_info, head->num_bytes);
+ cache->reserved -= head->num_bytes;
+ cache->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
@@ -4327,12 +5021,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
spin_unlock(&fs_info->delalloc_root_lock);
btrfs_destroy_delalloc_inodes(root);
- btrfs_put_fs_root(root);
+ btrfs_put_root(root);
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -4373,16 +5067,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
}
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *pinned_extents)
+ struct extent_io_tree *unpin)
{
- struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- bool loop = true;
- unpin = pinned_extents;
-again:
while (1) {
struct extent_state *cached_state = NULL;
@@ -4407,15 +5097,6 @@ again:
cond_resched();
}
- if (loop) {
- if (unpin == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
- loop = false;
- goto again;
- }
-
return 0;
}
@@ -4430,6 +5111,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
@@ -4506,8 +5188,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
- btrfs_destroy_pinned_extent(fs_info,
- fs_info->pinned_extents);
+ btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+
+ btrfs_free_redirty_list(cur_trans);
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
@@ -4552,22 +5235,71 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(t);
- trace_btrfs_transaction_commit(fs_info->tree_root);
+ trace_btrfs_transaction_commit(fs_info);
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
btrfs_destroy_all_ordered_extents(fs_info);
btrfs_destroy_delayed_inodes(fs_info);
btrfs_assert_delayed_root_empty(fs_info);
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
btrfs_destroy_all_delalloc_inodes(fs_info);
+ btrfs_drop_all_logs(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ root->free_objectid = max_t(u64, found_key.objectid + 1,
+ BTRFS_FIRST_FREE_OBJECTID);
+ } else {
+ root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ btrfs_warn(root->fs_info,
+ "the objectid of root %llu reaches its highest value",
+ root->root_key.objectid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = root->free_objectid++;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c2d6cf1ce59..9fa923e005a3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -6,9 +6,6 @@
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
@@ -20,14 +17,6 @@
*/
#define BTRFS_BDEV_BLOCKSIZE (4096)
-enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA,
- BTRFS_WQ_ENDIO_METADATA,
- BTRFS_WQ_ENDIO_FREE_SPACE,
- BTRFS_WQ_ENDIO_RAID56,
- BTRFS_WQ_ENDIO_DIO_REPAIR,
-};
-
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = SZ_16K;
@@ -39,51 +28,64 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key);
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr);
+ u64 bytenr, u64 owner_root,
+ int level);
void btrfs_clean_tree_block(struct extent_buffer *buf);
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
+int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
- struct buffer_head **bh_ret);
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
- struct btrfs_key *location);
-int btrfs_init_fs_root(struct btrfs_root *root);
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id);
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *key,
- bool check_ref);
-static inline struct btrfs_root *
-btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location)
-{
- return btrfs_get_fs_root(fs_info, location, true);
-}
-
+ u64 objectid, bool check_ref);
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, dev_t anon_dev);
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ u64 objectid);
+int btrfs_global_root_insert(struct btrfs_root *root);
+void btrfs_global_root_delete(struct btrfs_root *root);
+struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key);
+struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -95,32 +97,35 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
* If you want to ensure the whole tree is safe, you should use
* fs_info->subvol_srcu
*/
-static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
+ if (!root)
+ return NULL;
if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
-static inline void btrfs_put_fs_root(struct btrfs_root *root)
+static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
- if (refcount_dec_and_test(&root->refs))
- kfree(root);
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
}
+void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
- extent_submit_bio_start_t *submit_bio_start);
+int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
@@ -133,24 +138,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_init_lockdep(void);
-void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level);
-#else
-static inline void btrfs_init_lockdep(void)
-{ }
-static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level)
-{
-}
-#endif
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_init_root_free_objectid(struct btrfs_root *root);
#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 72e312cae69d..fab7eb76e53b 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -57,43 +57,25 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
-static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
- int check_generation)
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u64 generation,
+ int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
- struct btrfs_key key;
- int index;
- int err = 0;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
- key.objectid = root_objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto fail;
- }
-
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
+ root = btrfs_get_fs_root(fs_info, root_objectid, true);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
- inode = btrfs_iget(sb, &key, root);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto fail;
- }
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ inode = btrfs_iget(sb, objectid, root);
+ btrfs_put_root(root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
if (check_generation && generation != inode->i_generation) {
iput(inode);
@@ -101,9 +83,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
}
return d_obtain_alias(inode);
-fail:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -152,7 +131,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
-static struct dentry *btrfs_get_parent(struct dentry *child)
+struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = d_inode(child);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -212,9 +191,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0, 0);
}
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root));
+ return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -245,7 +222,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 57488ecd7d4e..5afb7ca42828 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -18,4 +18,9 @@ struct btrfs_fid {
u64 parent_root_objectid;
} __attribute__ ((packed));
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u64 generation,
+ int check_generation);
+struct dentry *btrfs_get_parent(struct dentry *child);
+
#endif
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644
index 000000000000..83cb0378096f
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.c
@@ -0,0 +1,1674 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ refcount_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode)
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add_state(state) do {} while (0)
+#define btrfs_leak_debug_del_state(state) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data)
+{
+ tree->fs_info = fs_info;
+ tree->state = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ tree->private_data = private_data;
+ tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ /*
+ * The given mask might be not appropriate for the slab allocator,
+ * drop the unsupported bits
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add_state(state);
+ refcount_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (refcount_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del_state(state);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return 0;
+ if (set && (state->state & bits) == bits)
+ return 0;
+ if (!set && (state->state & bits) == 0)
+ return 0;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ return ret;
+}
+
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_prev(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @node_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
+ * containing @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return pointer to entry that ends before @offset
+ * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct extent_state *entry = NULL;
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ if (node_ret)
+ *node_ret = node;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ /* Search neighbors until we find the first one past the end */
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+
+ return entry;
+}
+
+/*
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct extent_state **prev_ret,
+ struct extent_state **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct extent_state *orig_prev;
+ struct extent_state *entry = NULL;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ entry = rb_entry(*node, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ orig_prev = entry;
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+ *next_ret = entry;
+ entry = orig_prev;
+
+ while (entry && offset < entry->start)
+ entry = prev_state(entry);
+ *prev_ret = entry;
+
+ return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree->fs_info, err,
+ "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range. Any
+ * extents with matching state are merged together into a single extent in the
+ * tree. Extents with EXTENT_IO in their state field are not merged because
+ * the end_io handlers need to be able to do operations on them without
+ * sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *other;
+
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ return;
+
+ other = prev_state(state);
+ if (other && other->end == state->start - 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ other = next_state(state);
+ if (other && other->start == state->end + 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data, state,
+ other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
+ state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+ const u64 end = state->end;
+
+ set_state_bits(tree, state, bits, changeset);
+
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end]. After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
+
+ if (tree->private_data)
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
+ return 0;
+}
+
+/*
+ * Utility function to clear some bits in an extent state struct. It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, int wake,
+ struct extent_changeset *changeset)
+{
+ struct extent_state *next;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+/*
+ * Clear some bits on a range in the tree. This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
+ * range from the tree regardless of state (ie for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ u64 last_end;
+ int err;
+ int clear = 0;
+ int wake;
+ int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ refcount_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+
+ /* This search will find the extents that end after our range starts. */
+ state = tree_search(tree, start);
+ if (!state)
+ goto out;
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* The state doesn't have the wanted bits, go ahead. */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we clear the desired bit
+ * on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+}
+
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+{
+ struct extent_state *state;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * This search will find all the extents that end after our
+ * range starts.
+ */
+ state = tree_search(tree, start);
+process_node:
+ if (!state)
+ break;
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ refcount_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ state = next_state(state);
+ goto process_node;
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ refcount_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held. NULL will returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, u32 bits)
+{
+ struct extent_state *state;
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, start);
+ while (state) {
+ if (state->end >= start && (state->state & bits))
+ return state;
+ state = next_state(state);
+ }
+ return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ while ((state = next_state(state)) != NULL) {
+ if (state->state & bits)
+ goto got_it;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'. start and end are used to return the range,
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ bool found = false;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ if (!state) {
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (state) {
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ found = true;
+ *end = state->end;
+ cur_start = state->end + 1;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
+/*
+ * Set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, changeset);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
+ NULL, mask);
+}
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
+
+again:
+ if (!prealloc) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(GFP_NOFS);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going.
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid to free 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, NULL);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
+
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
+ *
+ * Since unallocated range is also considered one which doesn't have the bits
+ * set it's possible that @end_ret contains -1, this happens in case the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ struct extent_state *prev = NULL, *next;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ state = tree_search_prev_next(tree, start, &prev, &next);
+ if (!state && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!state && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ *start_ret = prev->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!state) {
+ state = next;
+ }
+
+ /*
+ * At this point 'state' either contains 'start' or start is
+ * before 'state'
+ */
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits sets--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as the
+ * beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev)
+ *start_ret = prev->end + 1;
+ else
+ *start_ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (state) {
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+/*
+ * Count the number of bytes in the tree that have a given bit(s) set. This
+ * can be fairly slow, except for EXTENT_DIRTY which is cached. The total
+ * number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end <= cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ while (state) {
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
+
+/*
+ * Searche a range in the state tree for a given mask. If 'filled' == 1, this
+ * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ state = cached;
+ else
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ state = next_state(state);
+ }
+
+ /* We ran out of states and were still inside of our range. */
+ if (filled && !state)
+ bitset = 0;
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
+ GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+ changeset);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ NULL, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, NULL);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ cached_state, NULL, GFP_NOFS);
+ while (err == -EEXIST) {
+ if (failed_start != start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, cached_state);
+
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ &failed_start, cached_state, NULL,
+ GFP_NOFS);
+ }
+ return err;
+}
+
+void __cold extent_state_free_cachep(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index a3febe746c79..a855f40dd61d 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,14 +17,35 @@ struct io_failure_record;
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+/*
+ * Must be cleared only during ordered extent completion or on error paths if we
+ * did not manage to submit bios and create the ordered extents for the range.
+ * Should not be cleared during page release and page invalidation (if there is
+ * an ordered extent in flight), that is left for the ordered extent completion.
+ */
#define EXTENT_DELALLOC_NEW (1U << 14)
+/*
+ * When an ordered extent successfully completes for a region marked as a new
+ * delalloc range, use this flag when clearing a new delalloc range to indicate
+ * that the VFS' inode number of bytes should be incremented and the inode's new
+ * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
+ */
+#define EXTENT_ADD_INODE_BYTES (1U << 15)
+
+/*
+ * Set during truncate when we're clearing an entire range and we just want the
+ * extent states to go away.
+ */
+#define EXTENT_CLEAR_ALL_BITS (1U << 16)
+
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
+ EXTENT_ADD_INODE_BYTES | \
+ EXTENT_CLEAR_ALL_BITS)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -34,30 +55,32 @@ struct io_failure_record;
*/
#define CHUNK_ALLOCATED EXTENT_DIRTY
#define CHUNK_TRIMMED EXTENT_DEFRAG
+#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
+ CHUNK_TRIMMED)
enum {
- IO_TREE_FS_INFO_FREED_EXTENTS0,
- IO_TREE_FS_INFO_FREED_EXTENTS1,
+ IO_TREE_FS_PINNED_EXTENTS,
+ IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
+ IO_TREE_INODE_FILE_EXTENT,
+ IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
spinlock_t lock;
- const struct extent_io_ops *ops;
};
struct extent_state {
@@ -68,110 +91,95 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
refcount_t refs;
- unsigned state;
-
- struct io_failure_record *failrec;
+ u32 state;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
-int __init extent_state_cache_init(void);
-void __cold extent_state_cache_exit(void);
-
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
void extent_io_tree_release(struct extent_io_tree *tree);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned bits, int contig);
+ u64 max_bytes, u32 bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled,
- struct extent_state *cached_state);
+ u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached);
+ u32 bits, struct extent_changeset *changeset);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
+ u32 bits, struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+ return __clear_extent_bit(tree, start, end, bits, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
+static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
+ u64 end, u32 bits)
{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+ return clear_extent_bit(tree, start, end, bits, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
+ u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits);
+ u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
+ u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -179,70 +187,53 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
+ EXTENT_DO_ACCOUNTING, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned int extra_bits,
+ u64 end, u32 extra_bits,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- NULL, cached_state, GFP_NOFS);
+ EXTENT_DELALLOC | extra_bits,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, GFP_NOFS);
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
- GFP_NOFS);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
+ u64 *start_ret, u64 *end_ret, u32 bits);
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-
-/* This should be reworked in the future and put elsewhere. */
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec);
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a7bc66121330..2801c991814f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,9 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "rcu-string.h"
+#include "zoned.h"
+#include "dev-replace.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -64,10 +67,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
- set_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- set_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
+ set_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
return 0;
}
@@ -79,50 +80,14 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
start = cache->start;
end = start + cache->length - 1;
- clear_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- clear_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
-}
-
-static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
-{
- if (ref->type == BTRFS_REF_METADATA) {
- if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
- return BTRFS_BLOCK_GROUP_SYSTEM;
- else
- return BTRFS_BLOCK_GROUP_METADATA;
- }
- return BTRFS_BLOCK_GROUP_DATA;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
}
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
+ struct btrfs_root *root = btrfs_extent_root(fs_info, start);
int ret;
struct btrfs_key key;
struct btrfs_path *path;
@@ -134,7 +99,7 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
key.objectid = start;
key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
btrfs_free_path(path);
return ret;
}
@@ -152,6 +117,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags)
{
+ struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
@@ -189,7 +155,8 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ extent_root = btrfs_extent_root(fs_info, bytenr);
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
@@ -207,7 +174,7 @@ search_again:
if (ret == 0) {
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (item_size >= sizeof(*ei)) {
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
@@ -403,12 +370,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -417,12 +383,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -432,8 +397,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -480,7 +446,7 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
u64 root_objectid,
u64 owner, u64 offset)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
@@ -556,7 +522,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner,
u64 offset, int refs_to_add)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
u32 size;
@@ -630,8 +596,9 @@ fail:
}
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop, int *last_ref)
+ int refs_to_drop)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -663,8 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
num_refs -= refs_to_drop;
if (num_refs == 0) {
- ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
- *last_ref = 1;
+ ret = btrfs_del_item(trans, root, path);
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -722,7 +688,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 parent,
u64 root_objectid)
{
- struct btrfs_root *root = trans->fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
@@ -746,6 +712,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 parent,
u64 root_objectid)
{
+ struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
struct btrfs_key key;
int ret;
@@ -758,8 +725,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
key.offset = root_objectid;
}
- ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
- path, &key, 0);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
btrfs_release_path(path);
return ret;
}
@@ -824,7 +790,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
u64 owner, u64 offset, int insert)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -848,6 +814,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
want = extent_ref_type(parent, owner);
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
+ path->search_for_extension = 1;
path->keep_locks = 1;
} else
extra_size = -1;
@@ -901,7 +868,7 @@ again:
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
btrfs_print_v0_err(fs_info);
@@ -928,7 +895,13 @@ again:
err = -ENOENT;
while (1) {
if (ptr >= end) {
- WARN_ON(ptr > end);
+ if (ptr > end) {
+ err = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ }
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
@@ -1000,6 +973,7 @@ again:
out:
if (insert) {
path->keep_locks = 0;
+ path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
return err;
@@ -1042,7 +1016,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
__run_delayed_extent_op(extent_op, leaf, ei);
ptr = (unsigned long)ei + item_offset;
- end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+ end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
if (ptr < end - size)
memmove_extent_buffer(leaf, ptr + size, ptr,
end - size - ptr);
@@ -1103,8 +1077,7 @@ static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op,
- int *last_ref)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_item *ei;
@@ -1152,9 +1125,8 @@ void update_inline_extent_backref(struct btrfs_path *path,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
- *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = (unsigned long)iref;
end = (unsigned long)ei + item_size;
if (ptr + size < end)
@@ -1181,9 +1153,23 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
- update_inline_extent_backref(path, iref, refs_to_add,
- extent_op, NULL);
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
+ update_inline_extent_backref(path, iref, refs_to_add, extent_op);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
root_objectid, owner, offset,
@@ -1193,42 +1179,21 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- u64 bytenr, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add)
-{
- int ret;
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- BUG_ON(refs_to_add != 1);
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
- } else {
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
- }
- return ret;
-}
-
static int remove_extent_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data, int *last_ref)
+ int refs_to_drop, int is_data)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
- if (iref) {
- update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
- last_ref);
- } else if (is_data) {
- ret = remove_extent_data_ref(trans, path, refs_to_drop,
- last_ref);
- } else {
- *last_ref = 1;
- ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
- }
+ if (iref)
+ update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ else if (is_data)
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ else
+ ret = btrfs_del_item(trans, root, path);
return ret;
}
@@ -1280,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (size) {
ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += size;
else if (ret != -EOPNOTSUPP)
@@ -1297,13 +1262,53 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (bytes_left) {
ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += bytes_left;
}
return ret;
}
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
+{
+ struct btrfs_device *dev = stripe->dev;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 phys = stripe->physical;
+ u64 len = stripe->length;
+ u64 discarded = 0;
+ int ret = 0;
+
+ /* Zone reset on a zoned filesystem */
+ if (btrfs_can_zone_reset(dev, phys, len)) {
+ u64 src_disc;
+
+ ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
+ if (ret)
+ goto out;
+
+ if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
+ dev != dev_replace->srcdev)
+ goto out;
+
+ src_disc = discarded;
+
+ /* Send to replace target as well */
+ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
+ &discarded);
+ discarded += src_disc;
+ } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
+ ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
+ } else {
+ ret = 0;
+ *bytes = 0;
+ }
+
+out:
+ *bytes = discarded;
+ return ret;
+}
+
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
@@ -1311,80 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_bio *bbio = NULL;
-
/*
- * Avoid races with device replace and make sure our bbio has devices
- * associated to its stripes that don't go away while we are discarding.
+ * Avoid races with device replace and make sure the devices in the
+ * stripes don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_discard_stripe *stripes;
+ unsigned int num_stripes;
int i;
num_bytes = end - cur;
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bbio, 0);
- /*
- * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
- * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
- * thus we can't continue anyway.
- */
- if (ret < 0)
- goto out;
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ if (IS_ERR(stripes)) {
+ ret = PTR_ERR(stripes);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ break;
+ }
- stripe = bbio->stripes;
- for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_discard_stripe *stripe = stripes + i;
u64 bytes;
- struct request_queue *req_q;
if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- req_q = bdev_get_queue(stripe->dev->bdev);
- if (!blk_queue_discard(req_q))
+
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &stripe->dev->dev_state))
continue;
- ret = btrfs_issue_discard(stripe->dev->bdev,
- stripe->physical,
- stripe->length,
- &bytes);
- if (!ret) {
- discarded_bytes += bytes;
- } else if (ret != -EOPNOTSUPP) {
+ ret = do_discard_extent(stripe, &bytes);
+ if (ret) {
/*
- * Logic errors or -ENOMEM, or -EIO, but
- * unlikely to happen.
- *
- * And since there are two loops, explicitly
- * go to out to avoid confusion.
+ * Keep going if discard is not supported by the
+ * device.
*/
- btrfs_put_bbio(bbio);
- goto out;
+ if (ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ } else {
+ discarded_bytes += bytes;
}
-
- /*
- * Just in case we get back EOPNOTSUPP for some reason,
- * just ignore the return value so we don't screw up
- * people calling discard_extent.
- */
- ret = 0;
}
- btrfs_put_bbio(bbio);
+ kfree(stripes);
+ if (ret)
+ break;
cur += num_bytes;
}
-out:
btrfs_bio_counter_dec(fs_info);
-
if (actual_bytes)
*actual_bytes = discarded_bytes;
-
-
- if (ret == -EOPNOTSUPP)
- ret = 0;
return ret;
}
@@ -1393,32 +1378,29 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
- ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
- NULL, &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
else
- ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
btrfs_ref_tree_mod(fs_info, generic_ref);
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- sub_pinned_bytes(fs_info, generic_ref);
-
return ret;
}
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -1429,7 +1411,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* bytenr of the parent block. Since new extents are always
* created with indirect references, this will only be the case
* when relocating a shared extent. In that case, root_objectid
- * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
+ * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
* be 0
*
* @root_objectid: The id of the root where this modification has originated,
@@ -1469,8 +1451,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
- path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -1494,11 +1474,16 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = READA_FORWARD;
- path->leave_spinning = 1;
/* now insert the actual backref */
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
- owner, offset, refs_to_add);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
+ root_objectid, owner, offset,
+ refs_to_add);
+ }
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -1574,6 +1559,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -1581,12 +1567,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
- int metadata = !extent_op->is_data;
+ int metadata = 1;
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
- if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
@@ -1603,10 +1589,9 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
key.offset = head->num_bytes;
}
+ root = btrfs_extent_root(fs_info, key.objectid);
again:
- path->reada = READA_FORWARD;
- path->leave_spinning = 1;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
goto out;
@@ -1638,7 +1623,7 @@ again:
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
@@ -1703,10 +1688,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
- if (trans->aborted) {
+ if (TRANS_ABORTED(trans)) {
if (insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return 0;
}
@@ -1721,8 +1705,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
else
BUG();
if (ret && insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return ret;
}
@@ -1798,34 +1781,15 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
{
int nr_items = 1; /* Dropping this ref head update. */
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
-
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-
- /*
- * We had csum deletions accounted for in our delayed refs rsv,
- * we need to drop the csum leaves for this update from our
- * delayed_refs_rsv.
- */
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info,
- head->num_bytes);
- }
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv, we need
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
+ */
+ if (head->total_ref_mod < 0 && head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
@@ -1867,11 +1831,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info->csum_root,
- head->bytenr, head->num_bytes);
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(fs_info, head->bytenr);
+ ret = btrfs_del_csums(trans, csum_root, head->bytenr,
+ head->num_bytes);
}
}
@@ -1880,7 +1846,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
@@ -2135,41 +2101,6 @@ static u64 find_middle(struct rb_root *root)
}
#endif
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
-{
- u64 num_bytes;
-
- num_bytes = heads * (sizeof(struct btrfs_extent_item) +
- sizeof(struct btrfs_extent_inline_ref));
- if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- num_bytes += heads * sizeof(struct btrfs_tree_block_info);
-
- /*
- * We don't ever fill up leaves all the way so multiply by 2 just to be
- * closer to what we're really going to want to use.
- */
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
-}
-
-/*
- * Takes the number of bytes to be csumm'ed and figures out how many leaves it
- * would require to store the csums for that many bytes.
- */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- u64 csum_size;
- u64 num_csums_per_leaf;
- u64 num_csums;
-
- csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
- num_csums_per_leaf = div64_u64(csum_size,
- (u64)btrfs_super_csum_size(fs_info->super_copy));
- num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
- num_csums += num_csums_per_leaf - 1;
- num_csums = div64_u64(num_csums, num_csums_per_leaf);
- return num_csums;
-}
-
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2191,7 +2122,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@@ -2199,7 +2130,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0)
- count = atomic_read(&delayed_refs->num_entries) * 2;
+ count = delayed_refs->num_heads_ready;
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -2238,8 +2169,8 @@ out:
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
- int level, int is_data)
+ struct extent_buffer *eb, u64 flags,
+ int level)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@@ -2251,10 +2182,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->is_data = is_data ? true : false;
extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -2290,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
+ if (path->nowait) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
@@ -2342,10 +2278,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct extent_buffer *leaf;
struct btrfs_extent_data_ref *ref;
struct btrfs_extent_inline_ref *iref;
@@ -2376,7 +2313,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
goto out;
ret = 1;
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
/* If extent item has more than 1 inline ref then it's shared */
@@ -2384,9 +2321,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2411,18 +2352,13 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict, struct btrfs_path *path)
{
- struct btrfs_path *path;
int ret;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
@@ -2430,8 +2366,8 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
} while (ret == -EAGAIN);
out:
- btrfs_free_path(path);
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ btrfs_release_path(path);
+ if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
}
@@ -2463,7 +2399,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
return 0;
if (full_backref)
@@ -2493,10 +2429,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset);
- generic_ref.skip_qgroup = for_reloc;
+ key.offset, root->root_key.objectid,
+ for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2508,9 +2443,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
- generic_ref.skip_qgroup = for_reloc;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
@@ -2536,19 +2470,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct btrfs_block_group *block_group;
- int readonly = 0;
-
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
- if (!block_group || block_group->ro)
- readonly = 1;
- if (block_group)
- btrfs_put_block_group(block_group);
- return readonly;
-}
-
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2566,29 +2487,27 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
return ret;
}
-static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
+static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group *cache;
- u64 bytenr;
+ struct rb_node *leftmost;
+ u64 bytenr = 0;
- spin_lock(&fs_info->block_group_cache_lock);
- bytenr = fs_info->first_logical_byte;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
+ /* Get the block group with the lowest logical start address. */
+ leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+ if (leftmost) {
+ struct btrfs_block_group *bg;
- if (bytenr < (u64)-1)
- return bytenr;
-
- cache = btrfs_lookup_first_block_group(fs_info, search_start);
- if (!cache)
- return 0;
-
- bytenr = cache->start;
- btrfs_put_block_group(cache);
+ bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+ bytenr = bg->start;
+ }
+ read_unlock(&fs_info->block_group_cache_lock);
return bytenr;
}
-static int pin_down_extent(struct btrfs_block_group *cache,
+static int pin_down_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -2605,24 +2524,20 @@ static int pin_down_extent(struct btrfs_block_group *cache,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(fs_info->pinned_extents, bytenr,
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_block_group *cache;
- ASSERT(fs_info->running_transaction);
-
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(cache, bytenr, num_bytes, reserved);
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@@ -2631,28 +2546,29 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
/*
- * pull in the free space cache (if any) so that our pin
- * removes the free space from the cache. We have load_only set
- * to one because the slow code to read in the free extents does check
- * the pinned extents.
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
*/
- btrfs_cache_block_group(cache, 1);
+ ret = btrfs_cache_block_group(cache, true);
+ if (ret)
+ goto out;
- pin_down_extent(cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+out:
btrfs_put_block_group(cache);
return ret;
}
@@ -2662,45 +2578,17 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
{
int ret;
struct btrfs_block_group *block_group;
- struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 0);
- caching_ctl = btrfs_get_caching_control(block_group);
-
- if (!caching_ctl) {
- /* Logic error */
- BUG_ON(!btrfs_block_group_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- } else {
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- if (ret)
- goto out_lock;
+ ret = btrfs_cache_block_group(block_group, true);
+ if (ret)
+ goto out;
- num_bytes = (start + num_bytes) -
- caching_ctl->progress;
- start = caching_ctl->progress;
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- }
-out_lock:
- mutex_unlock(&caching_ctl->mutex);
- btrfs_put_caching_control(caching_ctl);
- }
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+out:
btrfs_put_block_group(block_group);
return ret;
}
@@ -2743,36 +2631,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
atomic_inc(&bg->reservations);
}
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_caching_control *next;
- struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group *cache;
-
- down_write(&fs_info->commit_root_sem);
-
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- cache = caching_ctl->block_group;
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
-
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- fs_info->pinned_extents = &fs_info->freed_extents[1];
- else
- fs_info->pinned_extents = &fs_info->freed_extents[0];
-
- up_write(&fs_info->commit_root_sem);
-
- btrfs_update_global_block_rsv(fs_info);
-}
-
/*
* Returns the free cluster for the given space info and sets empty_cluster to
* what it should be based on the mount options.
@@ -2834,11 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- if (start < cache->last_byte_to_unpin) {
- len = min(len, cache->last_byte_to_unpin - start);
- if (return_free_space)
- btrfs_add_free_space(cache, start, len);
- }
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
start += len;
total_unpinned += len;
@@ -2862,21 +2717,22 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
+ } else if (btrfs_is_zoned(fs_info)) {
+ /* Need reset before reusing in a zoned block group */
+ space_info->bytes_zone_unusable += len;
+ readonly = true;
}
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
global_rsv->space_info == space_info) {
- u64 to_add = len;
-
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- to_add = min(len, global_rsv->size -
- global_rsv->reserved);
+ u64 to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(fs_info,
space_info, to_add);
@@ -2885,11 +2741,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2908,12 +2763,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 end;
int ret;
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
+ unpin = &trans->transaction->pinned_extents;
- while (!trans->aborted) {
+ while (!TRANS_ABORTED(trans)) {
struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
@@ -2950,14 +2802,14 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 trimmed = 0;
ret = -EROFS;
- if (!trans->aborted)
+ if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
block_group->start,
block_group->length,
&trimmed);
list_del_init(&block_group->bg_list);
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
if (ret) {
@@ -2971,6 +2823,94 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool is_data)
+{
+ int ret;
+
+ if (is_data) {
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(trans->fs_info, bytenr);
+ ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function get called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -2980,7 +2920,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_root *extent_root = info->extent_root;
+ struct btrfs_root *extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -2993,18 +2933,25 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
- int last_ref = 0;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ extent_root = btrfs_extent_root(info, bytenr);
+ ASSERT(extent_root);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
- path->leave_spinning = 1;
-
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -3013,6 +2960,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -3029,23 +2983,30 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+
+ /* Quick path didn't find the EXTEMT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
- ret = remove_extent_backref(trans, path, NULL,
- refs_to_drop,
- is_data, &last_ref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
+ ret = remove_extent_backref(trans, extent_root, path,
+ NULL, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
- path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3108,7 +3069,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, extent_slot);
+ item_size = btrfs_item_size(leaf, extent_slot);
if (unlikely(item_size < sizeof(*ei))) {
ret = -EINVAL;
btrfs_print_v0_err(info);
@@ -3120,19 +3081,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3144,34 +3112,63 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
}
if (found_extent) {
- ret = remove_extent_backref(trans, path, iref,
- refs_to_drop, is_data,
- &last_ref);
+ ret = remove_extent_backref(trans, extent_root, path,
+ iref, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
}
- last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -3180,32 +3177,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- if (is_data) {
- ret = btrfs_del_csums(trans, info->csum_root, bytenr,
- num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- }
-
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
}
btrfs_release_path(path);
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3264,44 +3255,67 @@ out_delayed_unlock:
}
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+ u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_ref generic_ref = { 0 };
- int pin = 1;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid);
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- int old_ref_mod, new_ref_mod;
+ root_id, 0, false);
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group *cache;
+ bool must_pin = false;
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
- if (!ret)
+ if (!ret) {
+ btrfs_redirty_list_add(trans->transaction, buf);
goto out;
+ }
}
- pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(cache, buf->start, buf->len, 1);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ must_pin = true;
+
+ if (must_pin || btrfs_is_zoned(fs_info)) {
+ btrfs_redirty_list_add(trans->transaction, buf);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@@ -3314,9 +3328,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
}
out:
- if (pin)
- add_pinned_bytes(fs_info, &generic_ref);
-
if (last_ref) {
/*
* Deleting the buffer, clear the corrupt flag since it doesn't
@@ -3330,7 +3341,6 @@ out:
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
@@ -3341,30 +3351,24 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
- btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
- old_ref_mod = new_ref_mod = 0;
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
- ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
} else {
- ret = btrfs_add_delayed_data_ref(trans, ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, ref);
-
return ret;
}
@@ -3395,6 +3399,7 @@ static struct btrfs_block_group *btrfs_lock_cluster(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
+ __acquires(&cluster->refill_lock)
{
struct btrfs_block_group *used_bg = NULL;
@@ -3438,13 +3443,20 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+enum btrfs_extent_allocation_policy {
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
+ BTRFS_EXTENT_ALLOC_ZONED,
+};
+
/*
* Structure used internally for find_free_extent() function. Wraps needed
* parameters.
*/
struct find_free_extent_ctl {
/* Basic allocation info */
+ u64 ram_bytes;
u64 num_bytes;
+ u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
@@ -3454,10 +3466,18 @@ struct find_free_extent_ctl {
/* For clustered allocation */
u64 empty_cluster;
+ struct btrfs_free_cluster *last_ptr;
+ bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
+ /* Allocation is called for tree-log */
+ bool for_treelog;
+
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
/* RAID index, converted from flags */
int index;
@@ -3489,6 +3509,12 @@ struct find_free_extent_ctl {
/* Found result */
u64 found_offset;
+
+ /* Hint where to start looking for an empty space */
+ u64 hint_byte;
+
+ /* Allocation policy */
+ enum btrfs_extent_allocation_policy policy;
};
@@ -3501,11 +3527,11 @@ struct find_free_extent_ctl {
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
static int find_free_extent_clustered(struct btrfs_block_group *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group **cluster_bg_ret)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **cluster_bg_ret)
{
struct btrfs_block_group *cluster_bg;
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 aligned_cluster;
u64 offset;
int ret;
@@ -3605,9 +3631,9 @@ refill_cluster:
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl)
{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 offset;
/*
@@ -3663,40 +3689,382 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
return 0;
}
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ int ret;
+
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+
+ return find_free_extent_unclustered(block_group, ffe_ctl);
+}
+
+/*
+ * Tree-log block group locking
+ * ============================
+ *
+ * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
+ * indicates the starting address of a block group, which is reserved only
+ * for tree-log metadata.
+ *
+ * Lock nesting
+ * ============
+ *
+ * space_info::lock
+ * block_group::lock
+ * fs_info::treelog_bg_lock
+ */
+
+/*
+ * Simple allocator for sequential-only block group. It only allows sequential
+ * allocation. No need to play with trees. This function also reserves the
+ * bytes as in btrfs_add_reserved_bytes.
+ */
+static int do_allocation_zoned(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 start = block_group->start;
+ u64 num_bytes = ffe_ctl->num_bytes;
+ u64 avail;
+ u64 bytenr = block_group->start;
+ u64 log_bytenr;
+ u64 data_reloc_bytenr;
+ int ret = 0;
+ bool skip = false;
+
+ ASSERT(btrfs_is_zoned(block_group->fs_info));
+
+ /*
+ * Do not allow non-tree-log blocks in the dedicated tree-log block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->treelog_bg_lock);
+ log_bytenr = fs_info->treelog_bg;
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ if (skip)
+ return 1;
+
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!ret && !btrfs_zone_activate(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+ * Return the error after taking the locks.
+ */
+ }
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
+
+ if (ret)
+ goto out;
+
+ ASSERT(!ffe_ctl->for_treelog ||
+ block_group->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
+
+ if (block_group->ro ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not allow currently using block group to be tree-log dedicated
+ * block group.
+ */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not allow currently used block group to be the data relocation
+ * dedicated block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (avail < num_bytes) {
+ if (ffe_ctl->max_extent_size < avail) {
+ /*
+ * With sequential allocator, free space is always
+ * contiguous
+ */
+ ffe_ctl->max_extent_size = avail;
+ ffe_ctl->total_free_space = avail;
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
+ fs_info->treelog_bg = block_group->start;
+
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+
+ ffe_ctl->found_offset = start + block_group->alloc_offset;
+ block_group->alloc_offset += num_bytes;
+ spin_lock(&ctl->tree_lock);
+ ctl->free_space -= num_bytes;
+ spin_unlock(&ctl->tree_lock);
+
+ /*
+ * We do not check if found_offset is aligned to stripesize. The
+ * address is anyway rewritten when using zone append writing.
+ */
+
+ ffe_ctl->search_start = ffe_ctl->found_offset;
+
+out:
+ if (ret && ffe_ctl->for_treelog)
+ fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc &&
+ fs_info->data_reloc_bg == block_group->start) {
+ /*
+ * Do not allow further allocations from this block group.
+ * Compared to increasing the ->ro, setting the
+ * ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable an allocation to avoid an allocation of
+ * regular (non-relocation data) extent. With mix of relocation
+ * extents and regular extents, we can dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily break the write pointer.
+ */
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
+ fs_info->data_reloc_bg = 0;
+ }
+ spin_unlock(&fs_info->relocation_bg_lock);
+ spin_unlock(&fs_info->treelog_bg_lock);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
+static int do_allocation(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
+ default:
+ BUG();
+ }
+}
+
+static void release_block_group(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ int delalloc)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ ffe_ctl->index);
+ btrfs_release_block_group(block_group, delalloc);
+}
+
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ if (!ffe_ctl->use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+}
+
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ found_extent_clustered(ffe_ctl, ins);
+ break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ /* If we can activate new zone, just allocate a chunk and use it */
+ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return 0;
+
+ /*
+ * We already reached the max active zones. Try to finish one block
+ * group to make a room for a new block group. This is only possible
+ * for a data block group because btrfs_zone_finish() may need to wait
+ * for a running transaction which can cause a deadlock for metadata
+ * allocation.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int ret = btrfs_zone_finish_one_bg(fs_info);
+
+ if (ret == 1)
+ return 0;
+ else if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * If we have enough free space left in an already active block group
+ * and we can't activate any other zone now, do not allow allocating a
+ * new chunk and let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+ return -ENOSPC;
+
+ /*
+ * Even min_alloc_size is not left in any block groups. Since we cannot
+ * activate a new block group, allocating it may not help. Let's tell a
+ * caller to try again and hope it progress something by writing some
+ * parts of the region. That is only possible for data block groups,
+ * where a part of the region can be written.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+ return -EAGAIN;
+
+ /*
+ * We cannot activate a new block group and no enough space left in any
+ * block groups. So, allocating a new block group may not help. But,
+ * there is nothing to do anyway, so let's go with it.
+ */
+ return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return can_allocate_chunk_zoned(fs_info, ffe_ctl);
+ default:
+ BUG();
+ }
+}
+
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ /*
+ * If we can't allocate a new chunk we've already looped through
+ * at least once, move on to the NO_EMPTY_SIZE case.
+ */
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+ return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Give up here */
+ return -ENOSPC;
+ default:
+ BUG();
+ }
+}
+
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
* Return <0 means we failed to locate any free extent.
*/
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
- struct btrfs_free_cluster *last_ptr,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
- int full_search, bool use_cluster)
+ bool full_search)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->chunk_root;
int ret;
if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
ffe_ctl->orig_have_caching_bg = true;
- if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
- ffe_ctl->have_caching_bg)
- return 1;
-
- if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
- return 1;
-
if (ins->objectid) {
- if (!use_cluster && last_ptr) {
- spin_lock(&last_ptr->lock);
- last_ptr->window_start = ins->objectid;
- spin_unlock(&last_ptr->lock);
- }
+ found_extent(ffe_ctl, ins);
return 0;
}
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
@@ -3725,6 +4093,11 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
int exist = 0;
+ /*Check if allocation policy allows to create a new chunk */
+ ret = can_allocate_chunk(fs_info, ffe_ctl);
+ if (ret)
+ return ret;
+
trans = current->journal_info;
if (trans)
exist = 1;
@@ -3737,18 +4110,12 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
- CHUNK_ALLOC_FORCE);
-
- /*
- * If we can't allocate a new chunk we've already looped
- * through at least once, move on to the NO_EMPTY_SIZE
- * case.
- */
- if (ret == -ENOSPC)
- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+ CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
- if (ret < 0 && ret != -ENOSPC)
+ if (ret == -ENOSPC)
+ ret = chunk_allocation_failed(ffe_ctl);
+ else if (ret < 0)
btrfs_abort_transaction(trans, ret);
else
ret = 0;
@@ -3759,6 +4126,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
+ return -ENOSPC;
+
/*
* Don't loop again if we already have no empty_size and
* no empty_cluster.
@@ -3774,6 +4144,85 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ /*
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
+ */
+ if (space_info->max_extent_size) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
+ }
+
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl->empty_cluster);
+ if (ffe_ctl->last_ptr) {
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&last_ptr->lock);
+ }
+
+ return 0;
+}
+
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
+ space_info, ins);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
+ if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
+ return 0;
+ default:
+ BUG();
+ }
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -3799,93 +4248,63 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
- u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte, struct btrfs_key *ins,
- u64 flags, int delalloc)
+static noinline int find_free_extent(struct btrfs_root *root,
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
- struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group *block_group = NULL;
- struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
- bool use_cluster = true;
bool full_search = false;
- WARN_ON(num_bytes < fs_info->sectorsize);
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
- ffe_ctl.num_bytes = num_bytes;
- ffe_ctl.empty_size = empty_size;
- ffe_ctl.flags = flags;
- ffe_ctl.search_start = 0;
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- ffe_ctl.delalloc = delalloc;
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
- ffe_ctl.have_caching_bg = false;
- ffe_ctl.orig_have_caching_bg = false;
- ffe_ctl.found_offset = 0;
+ ffe_ctl->search_start = 0;
+ /* For clustered allocation */
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+ /* For clustered allocation */
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+
+ if (btrfs_is_zoned(fs_info))
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
- space_info = btrfs_find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
return -ENOSPC;
}
- /*
- * If our free space is heavily fragmented we may not be able to make
- * big contiguous allocations, so instead of doing the expensive search
- * for free space, simply return ENOSPC with our max_extent_size so we
- * can go ahead and search for a more manageable chunk.
- *
- * If our max_extent_size is large enough for our allocation simply
- * disable clustering since we will likely not be able to find enough
- * space to create a cluster and induce latency trying.
- */
- if (unlikely(space_info->max_extent_size)) {
- spin_lock(&space_info->lock);
- if (space_info->max_extent_size &&
- num_bytes > space_info->max_extent_size) {
- ins->offset = space_info->max_extent_size;
- spin_unlock(&space_info->lock);
- return -ENOSPC;
- } else if (space_info->max_extent_size) {
- use_cluster = false;
- }
- spin_unlock(&space_info->lock);
- }
-
- last_ptr = fetch_cluster_info(fs_info, space_info,
- &ffe_ctl.empty_cluster);
- if (last_ptr) {
- spin_lock(&last_ptr->lock);
- if (last_ptr->block_group)
- hint_byte = last_ptr->window_start;
- if (last_ptr->fragmented) {
- /*
- * We still set window_start so we can keep track of the
- * last place we found an allocation to try and save
- * some time.
- */
- hint_byte = last_ptr->window_start;
- use_cluster = false;
- }
- spin_unlock(&last_ptr->lock);
- }
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
+ if (ret < 0)
+ return ret;
- ffe_ctl.search_start = max(ffe_ctl.search_start,
- first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
- if (ffe_ctl.search_start == hint_byte) {
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
- ffe_ctl.search_start);
+ ffe_ctl->search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -3893,7 +4312,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, flags) &&
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -3907,9 +4326,10 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(
- block_group->flags);
- btrfs_lock_block_group(block_group, delalloc);
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -3917,26 +4337,33 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
}
search:
- ffe_ctl.have_caching_bg = false;
- if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
- ffe_ctl.index == 0)
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
- &space_info->block_groups[ffe_ctl.index], list) {
+ &space_info->block_groups[ffe_ctl->index], list) {
+ struct btrfs_block_group *bg_ret;
+
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro))
+ if (unlikely(block_group->ro)) {
+ if (ffe_ctl->for_treelog)
+ btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
continue;
+ }
- btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->start;
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, flags)) {
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
@@ -3947,7 +4374,7 @@ search:
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((flags & extra) && !(block_group->flags & extra))
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
goto loop;
/*
@@ -3955,15 +4382,15 @@ search:
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
continue;
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_done(block_group);
- if (unlikely(!ffe_ctl.cached)) {
- ffe_ctl.have_caching_bg = true;
- ret = btrfs_cache_block_group(block_group, 0);
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
+ ret = btrfs_cache_block_group(block_group, false);
/*
* If we get ENOMEM here or something else we want to
@@ -3984,83 +4411,64 @@ have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
- /*
- * Ok we want to try and use the cluster allocator, so
- * lets look there
- */
- if (last_ptr && use_cluster) {
- struct btrfs_block_group *cluster_bg = NULL;
-
- ret = find_free_extent_clustered(block_group, last_ptr,
- &ffe_ctl, &cluster_bg);
-
- if (ret == 0) {
- if (cluster_bg && cluster_bg != block_group) {
- btrfs_release_block_group(block_group,
- delalloc);
- block_group = cluster_bg;
- }
- goto checks;
- } else if (ret == -EAGAIN) {
- goto have_block_group;
- } else if (ret > 0) {
- goto loop;
+ bg_ret = NULL;
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
+ if (ret == 0) {
+ if (bg_ret && bg_ret != block_group) {
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
+ block_group = bg_ret;
}
- /* ret == -ENOENT case falls through */
- }
-
- ret = find_free_extent_unclustered(block_group, last_ptr,
- &ffe_ctl);
- if (ret == -EAGAIN)
+ } else if (ret == -EAGAIN) {
goto have_block_group;
- else if (ret > 0)
+ } else if (ret > 0) {
goto loop;
- /* ret == 0 case falls through */
-checks:
- ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
- fs_info->stripesize);
+ }
+
+ /* Checks */
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (ffe_ctl.search_start + num_bytes >
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
block_group->start + block_group->length) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
- if (ffe_ctl.found_offset < ffe_ctl.search_start)
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
- ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
- num_bytes, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = ffe_ctl.search_start;
- ins->offset = num_bytes;
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
- num_bytes);
- btrfs_release_block_group(block_group, delalloc);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
- ffe_ctl.index);
- btrfs_release_block_group(block_group, delalloc);
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
- full_search, use_cluster);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4069,12 +4477,12 @@ loop:
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
- if (!ffe_ctl.max_extent_size)
- ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = ffe_ctl.max_extent_size;
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = ffe_ctl.max_extent_size;
+ ins->offset = ffe_ctl->max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
@@ -4132,15 +4540,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
@@ -4158,8 +4579,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu",
- flags, num_bytes);
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4189,28 +4610,53 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
+ start);
return -ENOSPC;
}
- ret = pin_down_extent(cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, start, len, 1);
btrfs_put_block_group(cache);
return ret;
}
+static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
+ if (ret) {
+ ASSERT(!ret);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ bytenr, num_bytes);
+ return ret;
+ }
+
+ trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
+ return 0;
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_inline_ref *iref;
@@ -4230,9 +4676,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- ins, size);
+ extent_root = btrfs_extent_root(fs_info, ins->objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -4265,18 +4710,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- ins->objectid, ins->offset);
- BUG();
- }
- trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
- return ret;
+ return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
@@ -4284,6 +4718,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_key extent_key;
@@ -4293,7 +4728,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 num_bytes;
u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -4303,21 +4737,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
extent_key.offset = ref->level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
- num_bytes = fs_info->nodesize;
} else {
extent_key.offset = node->num_bytes;
extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
- num_bytes = node->num_bytes;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
- &extent_key, size);
+ extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
+ ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
+ size);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -4341,7 +4773,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
@@ -4354,22 +4785,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, extent_key.objectid,
- num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, 1);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- extent_key.objectid, extent_key.offset);
- BUG();
- }
-
- trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
- fs_info->nodesize);
- return ret;
+ return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -4378,17 +4794,16 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
- int ret;
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
- ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
- ram_bytes, NULL, NULL);
- return ret;
+
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
@@ -4431,19 +4846,21 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
if (ret)
- btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
+ u64 lockdep_owner = owner;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
return buf;
@@ -4460,12 +4877,35 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
- btrfs_tree_lock(buf);
+ /*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ *
+ * The exception however is in replace_path() in relocation, where we
+ * hold the lock on the original fs root and then search for the reloc
+ * root. At that point we need to make sure any reloc root buffers are
+ * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+ * lockdep happy.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ /* btrfs_clean_tree_block() accesses generation field. */
+ btrfs_set_header_generation(buf, trans->transid);
+
+ /*
+ * This needs to stay, because we could allocate a freed block from an
+ * old tree into a new tree, so we need to make sure this new block is
+ * set to the appropriate level and owner.
+ */
+ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
- btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -4493,7 +4933,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
- trans->dirty = true;
/* this returns a buffer locked for blocking */
return buf;
}
@@ -4507,7 +4946,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4523,7 +4963,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4540,7 +4980,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
@@ -4566,16 +5006,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_key = skinny_metadata ? false : true;
extent_op->update_flags = true;
- extent_op->is_data = false;
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
- extent_op, NULL, NULL);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
goto out_free_delayed;
}
@@ -4584,6 +5022,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
@@ -4685,7 +5124,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(fs_info, bytenr);
+ btrfs_readahead_node_child(eb, slot);
nread++;
}
wc->reada_slot = slot;
@@ -4750,9 +5189,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, eb->start,
- eb->len, flag,
- btrfs_header_level(eb), 0);
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
+ btrfs_header_level(eb));
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -4845,16 +5283,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ root->root_key.objectid, level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
-
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
- level - 1);
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -4905,8 +5340,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation, level - 1,
- &first_key);
+ next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
+ generation, level - 1, &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -4914,7 +5349,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
}
level--;
@@ -4926,7 +5360,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
path->nodes[level] = next;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
@@ -4994,7 +5428,8 @@ skip:
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
@@ -5054,8 +5489,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
@@ -5098,8 +5532,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
}
btrfs_clean_tree_block(eb);
}
@@ -5117,7 +5550,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
@@ -5209,10 +5643,10 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*
* If called with for_reloc == 0, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref,
- int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
+ const bool is_reloc_root = (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5224,6 +5658,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int ret;
int level;
bool root_dropped = false;
+ bool unfinished_drop = false;
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
@@ -5240,7 +5675,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction start. See
+ * wait_reserve_ticket and the whole reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5250,9 +5692,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (err)
goto out_end_trans;
- if (block_rsv)
- trans->block_rsv = block_rsv;
-
/*
* This will help us catch people modifying the fs tree while we're
* dropping it. It is unsafe to mess with the fs tree while it's being
@@ -5262,12 +5701,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
* already dropped.
*/
set_bit(BTRFS_ROOT_DELETING, &root->state);
+ unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
@@ -5275,7 +5715,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
memcpy(&wc->update_progress, &key,
sizeof(wc->update_progress));
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5295,8 +5735,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking_write(path->nodes[level]);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
@@ -5308,7 +5747,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
BUG_ON(wc->refs[level] == 0);
- if (level == root_item->drop_level)
+ if (level == btrfs_root_drop_level(root_item))
break;
btrfs_tree_unlock(path->nodes[level]);
@@ -5353,7 +5792,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
btrfs_cpu_key_to_disk(&root_item->drop_progress,
&wc->drop_progress);
- root_item->drop_level = wc->drop_level;
+ btrfs_set_root_drop_level(root_item, wc->drop_level);
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
@@ -5367,6 +5806,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
@@ -5375,13 +5817,19 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_free;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction
+ * start. See wait_reserve_ticket and the whole
+ * reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
}
- if (block_rsv)
- trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
@@ -5395,7 +5843,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
@@ -5413,21 +5861,36 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+ /*
+ * This subvolume is going to be completely dropped, and won't be
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
+ * commit transaction time. So free it here manually.
+ */
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
+ btrfs_qgroup_free_meta_all_pertrans(root);
+
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
- } else {
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- btrfs_put_fs_root(root);
- }
+ else
+ btrfs_put_root(root);
root_dropped = true;
out_end_trans:
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
btrfs_free_path(path);
out:
/*
+ * We were an unfinished drop root, check to see if there are any
+ * pending, and if not clear and wake up any waiters.
+ */
+ if (!err && unfinished_drop)
+ btrfs_maybe_wake_unfinished_drop(fs_info);
+
+ /*
* So if we need to stop dropping the snapshot for whatever reason we
* need to make sure to add it back to the dead root list so that we
* keep trying to do the work later. This also cleans up roots if we
@@ -5436,8 +5899,6 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- if (err && err != -EAGAIN)
- btrfs_handle_fs_error(fs_info, err, NULL);
return err;
}
@@ -5472,17 +5933,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- btrfs_assert_tree_locked(parent);
+ btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
- btrfs_assert_tree_locked(node);
+ btrfs_assert_tree_write_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -5574,13 +6035,13 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
*/
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
- u64 start = SZ_1M, len = 0, end = 0;
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
*trimmed = 0;
/* Discard not supported = nothing to do. */
- if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ if (!bdev_max_discard_sectors(device->bdev))
return 0;
/* Not writable = nothing to do. */
@@ -5605,8 +6066,21 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
- /* Ensure we skip the reserved area in the first 1M */
- start = max_t(u64, start, SZ_1M);
+ /* Check if there are any CHUNK_* bits left */
+ if (start > device->total_bytes) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_in_rcu(fs_info,
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
+ start, end - start + 1,
+ rcu_str_deref(device->name),
+ device->total_bytes);
+ mutex_unlock(&fs_info->chunk_mutex);
+ ret = 0;
+ break;
+ }
+
+ /* Ensure we skip the reserved space on each device. */
+ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
/*
* If find_first_clear_extent_bit find a range that spans the
@@ -5660,9 +6134,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
- struct list_head *devices;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
@@ -5674,6 +6148,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
int dev_ret = 0;
int ret = 0;
+ if (range->start == U64_MAX)
+ return -EINVAL;
+
/*
* Check range overflow if range->len is set.
* The default range->len is U64_MAX.
@@ -5694,13 +6171,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
- ret = btrfs_cache_block_group(cache, 0);
- if (ret) {
- bg_failed++;
- bg_ret = ret;
- continue;
- }
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
bg_ret = ret;
@@ -5726,9 +6197,12 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
btrfs_warn(fs_info,
"failed to trim %llu block group(s), last error %d",
bg_failed, bg_ret);
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
@@ -5738,7 +6212,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
if (dev_failed)
btrfs_warn(fs_info,
@@ -5749,47 +6223,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
return bg_ret;
return dev_ret;
}
-
-/*
- * btrfs_{start,end}_write_no_snapshotting() are similar to
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
- * data into the page cache through nocow before the subvolume is snapshoted,
- * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshotting is ongoing and that cause the snapshot to be
- * inconsistent (writes followed by expanding truncates for example).
- */
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
-{
- percpu_counter_dec(&root->subv_writers->counter);
- cond_wake_up(&root->subv_writers->wait);
-}
-
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
-{
- if (atomic_read(&root->will_be_snapshotted))
- return 0;
-
- percpu_counter_inc(&root->subv_writers->counter);
- /*
- * Make sure counter is updated before we check for snapshot creation.
- */
- smp_mb();
- if (atomic_read(&root->will_be_snapshotted)) {
- btrfs_end_write_no_snapshotting(root);
- return 0;
- }
- return 1;
-}
-
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
-{
- while (true) {
- int ret;
-
- ret = btrfs_start_write_no_snapshotting(root);
- if (ret)
- break;
- wait_var_event(&root->will_be_snapshotted,
- !atomic_read(&root->will_be_snapshotted));
- }
-}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c0f202741e09..4dcf22e051ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -6,13 +6,15 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
+#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
-#include <linux/cleancache.h>
+#include <linux/fsverity.h>
+#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
@@ -24,105 +26,80 @@
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "block-group.h"
+#include "compression.h"
-static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set btrfs_bioset;
-
-static inline bool extent_state_in_tree(const struct extent_state *state)
-{
- return !RB_EMPTY_NODE(&state->rb_node);
-}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(buffers);
-static LIST_HEAD(states);
-
-static DEFINE_SPINLOCK(leak_lock);
-
-static inline
-void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_add(new, head);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_add(&eb->leak_list, &fs_info->allocated_ebs);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-static inline
-void btrfs_leak_debug_del(struct list_head *entry)
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
- list_del(entry);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-static inline void btrfs_extent_buffer_leak_debug_check(void)
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
struct extent_buffer *eb;
+ unsigned long flags;
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
-}
-
-static inline void btrfs_extent_state_leak_debug_check(void)
-{
- struct extent_state *state;
-
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
- state->start, state->end, state->state,
- extent_state_in_tree(state),
- refcount_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-#define btrfs_debug_check_extent_io_range(tree, start, end) \
- __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
-static inline void __btrfs_debug_check_extent_io_range(const char *caller,
- struct extent_io_tree *tree, u64 start, u64 end)
-{
- struct inode *inode = tree->private_data;
- u64 isize;
-
- if (!inode || !is_data_inode(inode))
+ /*
+ * If we didn't get into open_ctree our allocated_ebs will not be
+ * initialized, so just skip this.
+ */
+ if (!fs_info->allocated_ebs.next)
return;
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ WARN_ON(!list_empty(&fs_info->allocated_ebs));
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ while (!list_empty(&fs_info->allocated_ebs)) {
+ eb = list_first_entry(&fs_info->allocated_ebs,
+ struct extent_buffer, leak_list);
+ pr_err(
+ "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_header_owner(eb));
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
}
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
#else
-#define btrfs_leak_debug_add(new, head) do {} while (0)
-#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
-#define btrfs_extent_state_leak_debug_check() do {} while (0)
-#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#define btrfs_leak_debug_add_eb(eb) do {} while (0)
+#define btrfs_leak_debug_del_eb(eb) do {} while (0)
#endif
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
+/*
+ * Structure to record info about the bio being assembled, and other info like
+ * how many bytes are there before stripe/ordered extent boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ int mirror_num;
+ enum btrfs_compression_type compress_type;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+ btrfs_bio_end_io_t end_io_func;
};
struct extent_page_data {
- struct bio *bio;
- struct extent_io_tree *tree;
+ struct btrfs_bio_ctrl bio_ctrl;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -132,86 +109,59 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, unsigned bits,
- struct extent_changeset *changeset,
- int set)
+static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- int ret;
+ struct bio *bio;
+ struct bio_vec *bv;
+ struct inode *inode;
+ int mirror_num;
- if (!changeset)
- return 0;
- if (set && (state->state & bits) == bits)
- return 0;
- if (!set && (state->state & bits) == 0)
- return 0;
- changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(&changeset->range_changed, state->start, state->end,
- GFP_ATOMIC);
- return ret;
-}
+ if (!bio_ctrl->bio)
+ return;
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct extent_io_tree *tree = bio->bi_private;
+ bio = bio_ctrl->bio;
+ bv = bio_first_bvec_all(bio);
+ inode = bv->bv_page->mapping->host;
+ mirror_num = bio_ctrl->mirror_num;
+
+ /* Caller should ensure the bio has at least some range added */
+ ASSERT(bio->bi_iter.bi_size);
- bio->bi_private = NULL;
+ btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (!is_data_inode(inode))
+ btrfs_submit_metadata_bio(inode, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(inode, bio, mirror_num);
else
- btrfsic_submit_bio(bio);
+ btrfs_submit_data_read_bio(inode, bio, mirror_num,
+ bio_ctrl->compress_type);
- return blk_status_to_errno(ret);
-}
-
-/* Cleanup unsubmitted bios */
-static void end_write_bio(struct extent_page_data *epd, int ret)
-{
- if (epd->bio) {
- epd->bio->bi_status = errno_to_blk_status(ret);
- bio_endio(epd->bio);
- epd->bio = NULL;
- }
+ /* The bio is owned by the end_io handler now */
+ bio_ctrl->bio = NULL;
}
/*
- * Submit bio from extent page data via submit_one_bio
- *
- * Return 0 if everything is OK.
- * Return <0 for error.
+ * Submit or fail the current bio in an extent_page_data structure.
*/
-static int __must_check flush_write_bio(struct extent_page_data *epd)
+static void submit_write_bio(struct extent_page_data *epd, int ret)
{
- int ret = 0;
+ struct bio *bio = epd->bio_ctrl.bio;
- if (epd->bio) {
- ret = submit_one_bio(epd->bio, 0, 0);
- /*
- * Clean up of epd->bio is handled by its endio function.
- * And endio is either triggered by successful bio execution
- * or the error handler of submit bio hook.
- * So at this point, no matter what happened, we don't need
- * to clean up epd->bio.
- */
- epd->bio = NULL;
- }
- return ret;
-}
+ if (!bio)
+ return;
-int __init extent_state_cache_init(void)
-{
- extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!extent_state_cache)
- return -ENOMEM;
- return 0;
+ if (ret) {
+ ASSERT(ret < 0);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ /* The bio is owned by the end_io handler now */
+ epd->bio_ctrl.bio = NULL;
+ } else {
+ submit_one_bio(&epd->bio_ctrl);
+ }
}
-int __init extent_io_init(void)
+int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
@@ -219,1246 +169,17 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
return -ENOMEM;
- if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
- BIOSET_NEED_BVECS))
- goto free_buffer_cache;
-
- if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
- goto free_bioset;
-
return 0;
-
-free_bioset:
- bioset_exit(&btrfs_bioset);
-
-free_buffer_cache:
- kmem_cache_destroy(extent_buffer_cache);
- extent_buffer_cache = NULL;
- return -ENOMEM;
-}
-
-void __cold extent_state_cache_exit(void)
-{
- btrfs_extent_state_leak_debug_check();
- kmem_cache_destroy(extent_state_cache);
}
-void __cold extent_io_exit(void)
+void __cold extent_buffer_free_cachep(void)
{
- btrfs_extent_buffer_leak_debug_check();
-
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
kmem_cache_destroy(extent_buffer_cache);
- bioset_exit(&btrfs_bioset);
-}
-
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data)
-{
- tree->fs_info = fs_info;
- tree->state = RB_ROOT;
- tree->ops = NULL;
- tree->dirty_bytes = 0;
- spin_lock_init(&tree->lock);
- tree->private_data = private_data;
- tree->owner = owner;
-}
-
-void extent_io_tree_release(struct extent_io_tree *tree)
-{
- spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- /*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
- */
- ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
-
- cond_resched_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
-}
-
-static struct extent_state *alloc_extent_state(gfp_t mask)
-{
- struct extent_state *state;
-
- /*
- * The given mask might be not appropriate for the slab allocator,
- * drop the unsupported bits
- */
- mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
- state = kmem_cache_alloc(extent_state_cache, mask);
- if (!state)
- return state;
- state->state = 0;
- state->failrec = NULL;
- RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&state->leak_list, &states);
- refcount_set(&state->refs, 1);
- init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
- return state;
-}
-
-void free_extent_state(struct extent_state *state)
-{
- if (!state)
- return;
- if (refcount_dec_and_test(&state->refs)) {
- WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&state->leak_list);
- trace_free_extent_state(state, _RET_IP_);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-static struct rb_node *tree_insert(struct rb_root *root,
- struct rb_node *search_start,
- u64 offset,
- struct rb_node *node,
- struct rb_node ***p_in,
- struct rb_node **parent_in)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- if (p_in && parent_in) {
- p = *p_in;
- parent = *parent_in;
- goto do_insert;
- }
-
- p = search_start ? &search_start : &root->rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- p = &(*p)->rb_left;
- else if (offset > entry->end)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
-do_insert:
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-/**
- * __etree_search - searche @tree for an entry that contains @offset. Such
- * entry would have entry->start <= offset && entry->end >= offset.
- *
- * @tree - the tree to search
- * @offset - offset that should fall within an entry in @tree
- * @next_ret - pointer to the first entry whose range ends after @offset
- * @prev - pointer to the first entry whose range begins before @offset
- * @p_ret - pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret - points to entry which would have been the parent of the entry,
- * containing @offset
- *
- * This function returns a pointer to the entry that contains @offset byte
- * address. If no such entry exists, then NULL is returned and the other
- * pointer arguments to the function are filled, otherwise the found entry is
- * returned and other pointers are left untouched.
- */
-static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **next_ret,
- struct rb_node **prev_ret,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **n = &root->rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
- struct tree_entry *entry;
- struct tree_entry *prev_entry = NULL;
-
- while (*n) {
- prev = *n;
- entry = rb_entry(prev, struct tree_entry, rb_node);
- prev_entry = entry;
-
- if (offset < entry->start)
- n = &(*n)->rb_left;
- else if (offset > entry->end)
- n = &(*n)->rb_right;
- else
- return *n;
- }
-
- if (p_ret)
- *p_ret = n;
- if (parent_ret)
- *parent_ret = prev;
-
- if (next_ret) {
- orig_prev = prev;
- while (prev && offset > prev_entry->end) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
- }
-
- if (prev_ret) {
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
- }
- return NULL;
-}
-
-static inline struct rb_node *
-tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
-{
- struct rb_node *next= NULL;
- struct rb_node *ret;
-
- ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
- if (!ret)
- return next;
- return ret;
-}
-
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
-{
- return tree_search_for_insert(tree, offset, NULL, NULL);
-}
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree. Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static void merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
-{
- struct extent_state *other;
- struct rb_node *other_node;
-
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- return;
-
- other_node = rb_prev(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
- other_node = rb_next(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits,
- struct extent_changeset *changeset);
-
-/*
- * insert an extent_state struct into the tree. 'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally. This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state, u64 start, u64 end,
- struct rb_node ***p,
- struct rb_node **parent,
- unsigned *bits, struct extent_changeset *changeset)
-{
- struct rb_node *node;
-
- if (end < start) {
- btrfs_err(tree->fs_info,
- "insert state: end < start %llu %llu", end, start);
- WARN_ON(1);
- }
- state->start = start;
- state->end = end;
-
- set_state_bits(tree, state, bits, changeset);
-
- node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
- if (node) {
- struct extent_state *found;
- found = rb_entry(node, struct extent_state, rb_node);
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- found->start, found->end, start, end);
- return -EEXIST;
- }
- merge_state(tree, state);
- return 0;
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half. 'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end]. After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
- struct extent_state *prealloc, u64 split)
-{
- struct rb_node *node;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_split_delalloc_extent(tree->private_data, orig, split);
-
- prealloc->start = orig->start;
- prealloc->end = split - 1;
- prealloc->state = orig->state;
- orig->start = split;
-
- node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
- &prealloc->rb_node, NULL, NULL);
- if (node) {
- free_extent_state(prealloc);
- return -EEXIST;
- }
- return 0;
-}
-
-static struct extent_state *next_state(struct extent_state *state)
-{
- struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up anyone waiting on this state (wake == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned *bits, int wake,
- struct extent_changeset *changeset)
-{
- struct extent_state *next;
- unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
- int ret;
-
- if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- WARN_ON(range > tree->dirty_bytes);
- tree->dirty_bytes -= range;
- }
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_clear_delalloc_extent(tree->private_data, state, bits);
-
- ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
- BUG_ON(ret < 0);
- state->state &= ~bits_to_clear;
- if (wake)
- wake_up(&state->wq);
- if (state->state == 0) {
- next = next_state(state);
- if (extent_state_in_tree(state)) {
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
- } else {
- WARN_ON(1);
- }
- } else {
- merge_state(tree, state);
- next = next_state(state);
- }
- return next;
-}
-
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
-{
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
-
- return prealloc;
-}
-
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
-{
- struct inode *inode = tree->private_data;
-
- btrfs_panic(btrfs_sb(inode->i_sb), err,
- "locking error: extent tree was modified by another thread while locked");
-}
-
-/*
- * clear some bits on a range in the tree. This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns 0 on success and < 0 on error.
- */
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *cached;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- u64 last_end;
- int err;
- int clear = 0;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
-
- if (bits & EXTENT_DELALLOC)
- bits |= EXTENT_NORESERVE;
-
- if (delete)
- bits |= ~EXTENT_CTLBITS;
-
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- clear = 1;
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state) {
- cached = *cached_state;
-
- if (clear) {
- *cached_state = NULL;
- cached_state = NULL;
- }
-
- if (cached && extent_state_in_tree(cached) &&
- cached->start <= start && cached->end > start) {
- if (clear)
- refcount_dec(&cached->refs);
- state = cached;
- goto hit_next;
- }
- if (clear)
- free_extent_state(cached);
- }
- /*
- * this search will find the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- if (state->start > end)
- goto out;
- WARN_ON(state->end < start);
- last_end = state->end;
-
- /* the state doesn't have the wanted bits, go ahead */
- if (!(state->state & bits)) {
- state = next_state(state);
- goto next;
- }
-
- /*
- * | ---- desired range ---- |
- * | state | or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip
- * bits on second half.
- *
- * If the extent we found extends past our range, we
- * just split and search again. It'll get split again
- * the next time though.
- *
- * If the extent we found is inside our range, we clear
- * the desired bit on it.
- */
-
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake,
- changeset);
- goto next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and clear the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- if (wake)
- wake_up(&state->wq);
-
- clear_state_bit(tree, prealloc, &bits, wake, changeset);
-
- prealloc = NULL;
- goto out;
- }
-
- state = clear_state_bit(tree, state, &bits, wake, changeset);
-next:
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start <= end && state && !need_resched())
- goto hit_next;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
-
-}
-
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits)
-{
- struct extent_state *state;
- struct rb_node *node;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
-
- spin_lock(&tree->lock);
-again:
- while (1) {
- /*
- * this search will find all the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
-process_node:
- if (!node)
- break;
-
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (state->start > end)
- goto out;
-
- if (state->state & bits) {
- start = state->start;
- refcount_inc(&state->refs);
- wait_on_state(tree, state);
- free_extent_state(state);
- goto again;
- }
- start = state->end + 1;
-
- if (start > end)
- break;
-
- if (!cond_resched_lock(&tree->lock)) {
- node = rb_next(node);
- goto process_node;
- }
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned *bits, struct extent_changeset *changeset)
-{
- unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
- int ret;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_set_delalloc_extent(tree->private_data, state, bits);
-
- if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- tree->dirty_bytes += range;
- }
- ret = add_extent_changeset(state, bits_to_set, changeset, 1);
- BUG_ON(ret < 0);
- state->state |= bits_to_set;
-}
-
-static void cache_state_if_flags(struct extent_state *state,
- struct extent_state **cached_ptr,
- unsigned flags)
-{
- if (cached_ptr && !(*cached_ptr)) {
- if (!flags || (state->state & flags)) {
- *cached_ptr = state;
- refcount_inc(&state->refs);
- }
- }
-}
-
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
-{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
-}
-
-/*
- * set some bits on a range in the tree. This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the desired bits set. The start of the
- * existing range is returned in failed_start in this case.
- *
- * [start, end] is inclusive This takes the tree lock.
- */
-
-static int __must_check
-__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned exclusive_bits,
- u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
-
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- if (state->state & exclusive_bits) {
- *failed_start = state->start;
- err = -EEXIST;
- goto out;
- }
-
- set_state_bits(tree, state, &bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, &bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
-
- /*
- * Avoid to free 'prealloc' if it can be merged with
- * the later extent.
- */
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, &bits, changeset);
- cache_state(prealloc, cached_state);
- merge_state(tree, prealloc);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-
-}
-
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 * failed_start,
- struct extent_state **cached_state, gfp_t mask)
-{
- return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask, NULL);
-}
-
-
-/**
- * convert_extent_bit - convert all bits in a given range from one bit to
- * another
- * @tree: the io tree to search
- * @start: the start offset in bytes
- * @end: the end offset in bytes (inclusive)
- * @bits: the bits to set in this range
- * @clear_bits: the bits to clear in this range
- * @cached_state: state that we're going to cache
- *
- * This will go through and set bits for the given range. If any states exist
- * already in this range they are set with the given bit and cleared of the
- * clear_bits. This is only meant to be used by things that are mergeable, ie
- * converting from say DELALLOC to DIRTY. This is not meant to be used with
- * boundary bits like LOCK.
- *
- * All allocations are done with GFP_NOFS.
- */
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
- bool first_iteration = true;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
- clear_bits);
-
-again:
- if (!prealloc) {
- /*
- * Best effort, don't worry if extent state allocation fails
- * here for the first iteration. We might have a cached state
- * that matches exactly the target range, in which case no
- * extent state allocations are needed. We'll only know this
- * after locking the tree.
- */
- prealloc = alloc_extent_state(GFP_NOFS);
- if (!prealloc && !first_iteration)
- return -ENOMEM;
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0,
- NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- /*
- * Avoid to free 'prealloc' if it can be merged with
- * the later extent.
- */
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, &bits, NULL);
- cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- cond_resched();
- first_iteration = false;
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-}
-
-/* wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
-{
- /*
- * We don't support EXTENT_LOCKED yet, as current changeset will
- * record any bits changed, so for EXTENT_LOCKED case, it will
- * either fail with -EEXIST or changeset will record the whole
- * range.
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
-}
-
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits)
-{
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
-}
-
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, wake, delete,
- cached, GFP_NOFS, NULL);
-}
-
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
-{
- /*
- * Don't support EXTENT_LOCKED case, same reason as
- * set_record_extent_bits().
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
- changeset);
-}
-
-/*
- * either insert or lock state struct between start and end use mask to tell
- * us if waiting is desired.
- */
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state)
-{
- int err;
- u64 failed_start;
-
- while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
- start = failed_start;
- } else
- break;
- WARN_ON(start > end);
- }
- return err;
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- int err;
- u64 failed_start;
-
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL);
- return 0;
- }
- return 1;
}
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
@@ -1478,276 +199,142 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
+ struct address_space *mapping = inode->i_mapping;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- __set_page_dirty_nobuffers(page);
- account_page_redirty(page);
- put_page(page);
- index++;
+ folio = filemap_get_folio(mapping, index);
+ filemap_dirty_folio(mapping, folio);
+ folio_account_redirty(folio);
+ index += folio_nr_pages(folio);
+ folio_put(folio);
}
}
-/* find the first state struct with 'bits' set after 'start', and
- * return it. tree->lock must be held. NULL will returned if
- * nothing was found after 'start'
+/*
+ * Process one page for __process_pages_contig().
+ *
+ * Return >0 if we hit @page == @locked_page.
+ * Return 0 if we updated the page status.
+ * Return -EGAIN if the we need to try again.
+ * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
*/
-static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned bits)
+static int process_one_page(struct btrfs_fs_info *fs_info,
+ struct address_space *mapping,
+ struct page *page, struct page *locked_page,
+ unsigned long page_ops, u64 start, u64 end)
{
- struct rb_node *node;
- struct extent_state *state;
+ u32 len;
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
+ ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
+ len = end + 1 - start;
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits))
- return state;
+ if (page_ops & PAGE_SET_ORDERED)
+ btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ if (page_ops & PAGE_SET_ERROR)
+ btrfs_page_clamp_set_error(fs_info, page, start, len);
+ if (page_ops & PAGE_START_WRITEBACK) {
+ btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
+ btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ }
+ if (page_ops & PAGE_END_WRITEBACK)
+ btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
- node = rb_next(node);
- if (!node)
- break;
+ if (page == locked_page)
+ return 1;
+
+ if (page_ops & PAGE_LOCK) {
+ int ret;
+
+ ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
+ if (ret)
+ return ret;
+ if (!PageDirty(page) || page->mapping != mapping) {
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return -EAGAIN;
+ }
}
-out:
- return NULL;
+ if (page_ops & PAGE_UNLOCK)
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return 0;
}
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned. If found something, return 0.
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
- struct extent_state **cached_state)
+static int __process_pages_contig(struct address_space *mapping,
+ struct page *locked_page,
+ u64 start, u64 end, unsigned long page_ops,
+ u64 *processed_end)
{
- struct extent_state *state;
- int ret = 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned long pages_processed = 0;
+ struct folio_batch fbatch;
+ int err = 0;
+ int i;
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->end == start - 1 && extent_state_in_tree(state)) {
- while ((state = next_state(state)) != NULL) {
- if (state->state & bits)
- goto got_it;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- goto out;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
+ if (page_ops & PAGE_LOCK) {
+ ASSERT(page_ops == PAGE_LOCK);
+ ASSERT(processed_end && *processed_end == start);
}
- state = find_first_extent_bit_state(tree, start, bits);
-got_it:
- if (state) {
- cache_state_if_flags(state, cached_state, 0);
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- }
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
+ if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
+ mapping_set_error(mapping, -EIO);
-/**
- * find_first_clear_extent_bit - find the first range that has @bits not set.
- * This range could start before @start.
- *
- * @tree - the tree to search
- * @start - the offset at/after which the found extent should start
- * @start_ret - records the beginning of the range
- * @end_ret - records the end of the range (inclusive)
- * @bits - the set of bits which must be unset
- *
- * Since unallocated range is also considered one which doesn't have the bits
- * set it's possible that @end_ret contains -1, this happens in case the range
- * spans (last_range_end, end of device]. In this case it's up to the caller to
- * trim @end_ret to the appropriate size.
- */
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
-{
- struct extent_state *state;
- struct rb_node *node, *prev = NULL, *next;
+ folio_batch_init(&fbatch);
+ while (index <= end_index) {
+ int found_folios;
- spin_lock(&tree->lock);
+ found_folios = filemap_get_folios_contig(mapping, &index,
+ end_index, &fbatch);
- /* Find first extent with bits cleared */
- while (1) {
- node = __etree_search(tree, start, &next, &prev, NULL, NULL);
- if (!node && !next && !prev) {
- /*
- * Tree is completely empty, send full range and let
- * caller deal with it
- */
- *start_ret = 0;
- *end_ret = -1;
- goto out;
- } else if (!node && !next) {
+ if (found_folios == 0) {
/*
- * We are past the last allocated chunk, set start at
- * the end of the last extent.
+ * Only if we're going to lock these pages, we can find
+ * nothing at @index.
*/
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
+ ASSERT(page_ops & PAGE_LOCK);
+ err = -EAGAIN;
goto out;
- } else if (!node) {
- node = next;
}
- /*
- * At this point 'node' either contains 'start' or start is
- * before 'node'
- */
- state = rb_entry(node, struct extent_state, rb_node);
- if (in_range(start, state->start, state->end - state->start + 1)) {
- if (state->state & bits) {
- /*
- * |--range with bits sets--|
- * |
- * start
- */
- start = state->end + 1;
- } else {
- /*
- * 'start' falls within a range that doesn't
- * have the bits set, so take its start as
- * the beginning of the desired range
- *
- * |--range with bits cleared----|
- * |
- * start
- */
- *start_ret = state->start;
- break;
- }
- } else {
- /*
- * |---prev range---|---hole/unset---|---node range---|
- * |
- * start
- *
- * or
- *
- * |---hole/unset--||--first node--|
- * 0 |
- * start
- */
- if (prev) {
- state = rb_entry(prev, struct extent_state,
- rb_node);
- *start_ret = state->end + 1;
- } else {
- *start_ret = 0;
+ for (i = 0; i < found_folios; i++) {
+ int process_ret;
+ struct folio *folio = fbatch.folios[i];
+ process_ret = process_one_page(fs_info, mapping,
+ &folio->page, locked_page, page_ops,
+ start, end);
+ if (process_ret < 0) {
+ err = -EAGAIN;
+ folio_batch_release(&fbatch);
+ goto out;
}
- break;
+ pages_processed += folio_nr_pages(folio);
}
- }
-
- /*
- * Find the longest stretch from start until an entry which has the
- * bits set
- */
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && !(state->state & bits)) {
- *end_ret = state->end;
- } else {
- *end_ret = state->start - 1;
- break;
- }
-
- node = rb_next(node);
- if (!node)
- break;
+ folio_batch_release(&fbatch);
+ cond_resched();
}
out:
- spin_unlock(&tree->lock);
-}
-
-/*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'. start and end are used to return the range,
- *
- * true is returned if we find something, false if nothing was in the tree
- */
-bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
- u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
-{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- bool found = false;
- u64 total_bytes = 0;
-
- spin_lock(&tree->lock);
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node) {
- *end = (u64)-1;
- goto out;
- }
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (found && (state->start != cur_start ||
- (state->state & EXTENT_BOUNDARY))) {
- goto out;
- }
- if (!(state->state & EXTENT_DELALLOC)) {
- if (!found)
- *end = state->end;
- goto out;
- }
- if (!found) {
- *start = state->start;
- *cached_state = state;
- refcount_inc(&state->refs);
- }
- found = true;
- *end = state->end;
- cur_start = state->end + 1;
- node = rb_next(node);
- total_bytes += state->end - state->start + 1;
- if (total_bytes >= max_bytes)
- break;
- if (!node)
- break;
+ if (err && processed_end) {
+ /*
+ * Update @processed_end. I know this is awful since it has
+ * two different return value patterns (inclusive vs exclusive).
+ *
+ * But the exclusive pattern is necessary if @start is 0, or we
+ * underflow and check against processed_end won't work as
+ * expected.
+ */
+ if (pages_processed)
+ *processed_end = min(end,
+ ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
+ else
+ *processed_end = start;
}
-out:
- spin_unlock(&tree->lock);
- return found;
+ return err;
}
-static int __process_pages_contig(struct address_space *mapping,
- struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret);
-
static noinline void __unlock_for_delalloc(struct inode *inode,
struct page *locked_page,
u64 start, u64 end)
@@ -1759,7 +346,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
if (index == locked_page->index && end_index == index)
return;
- __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
+ __process_pages_contig(inode->i_mapping, locked_page, start, end,
PAGE_UNLOCK, NULL);
}
@@ -1769,36 +356,48 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 delalloc_end)
{
unsigned long index = delalloc_start >> PAGE_SHIFT;
- unsigned long index_ret = index;
unsigned long end_index = delalloc_end >> PAGE_SHIFT;
+ u64 processed_end = delalloc_start;
int ret;
ASSERT(locked_page);
if (index == locked_page->index && index == end_index)
return 0;
- ret = __process_pages_contig(inode->i_mapping, locked_page, index,
- end_index, PAGE_LOCK, &index_ret);
- if (ret == -EAGAIN)
+ ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
+ delalloc_end, PAGE_LOCK, &processed_end);
+ if (ret == -EAGAIN && processed_end > delalloc_start)
__unlock_for_delalloc(inode, locked_page, delalloc_start,
- (u64)index_ret << PAGE_SHIFT);
+ processed_end);
return ret;
}
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes. @Start and @end are used to return the range,
+ * more than @max_bytes.
+ *
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range
+ * Will store the extent range end bytenr.
*
- * Return: true if we find something
- * false if nothing was in the tree
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
+ /* The sanity tests may not set a valid fs_info. */
+ u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
bool found;
@@ -1806,15 +405,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
int ret;
int loops = 0;
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
- if (!found || delalloc_end <= *start) {
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
*start = delalloc_start;
- *end = delalloc_end;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
free_extent_state(cached_state);
return false;
}
@@ -1854,14 +461,14 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
+ lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent_cached(tree, delalloc_start, delalloc_end,
- &cached_state);
+ unlock_extent(tree, delalloc_start, delalloc_end,
+ &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
@@ -1874,306 +481,50 @@ out_failed:
return found;
}
-static int __process_pages_contig(struct address_space *mapping,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret)
+ u32 clear_bits, unsigned long page_ops)
{
- unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_locked = 0;
- pgoff_t index = start_index;
- struct page *pages[16];
- unsigned ret;
- int err = 0;
- int i;
-
- if (page_ops & PAGE_LOCK) {
- ASSERT(page_ops == PAGE_LOCK);
- ASSERT(index_ret && *index_ret == start_index);
- }
-
- if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
- mapping_set_error(mapping, -EIO);
-
- while (nr_pages > 0) {
- ret = find_get_pages_contig(mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- /*
- * Only if we're going to lock these pages,
- * can we find nothing at @index.
- */
- ASSERT(page_ops & PAGE_LOCK);
- err = -EAGAIN;
- goto out;
- }
-
- for (i = 0; i < ret; i++) {
- if (page_ops & PAGE_SET_PRIVATE2)
- SetPagePrivate2(pages[i]);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
- if (locked_page && pages[i] == locked_page) {
- put_page(pages[i]);
- pages_locked++;
- continue;
- }
- if (page_ops & PAGE_CLEAR_DIRTY)
- clear_page_dirty_for_io(pages[i]);
- if (page_ops & PAGE_SET_WRITEBACK)
- set_page_writeback(pages[i]);
- if (page_ops & PAGE_SET_ERROR)
- SetPageError(pages[i]);
- if (page_ops & PAGE_END_WRITEBACK)
- end_page_writeback(pages[i]);
- if (page_ops & PAGE_UNLOCK)
- unlock_page(pages[i]);
- if (page_ops & PAGE_LOCK) {
- lock_page(pages[i]);
- if (!PageDirty(pages[i]) ||
- pages[i]->mapping != mapping) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- err = -EAGAIN;
- goto out;
- }
- }
- put_page(pages[i]);
- pages_locked++;
- }
- nr_pages -= ret;
- index += ret;
- cond_resched();
- }
-out:
- if (err && index_ret)
- *index_ret = start_index + pages_locked - 1;
- return err;
+ __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
+ start, end, page_ops, NULL);
}
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
- struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+static int insert_failrec(struct btrfs_inode *inode,
+ struct io_failure_record *failrec)
{
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
- NULL);
-
- __process_pages_contig(inode->i_mapping, locked_page,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT,
- page_ops, NULL);
-}
-
-/*
- * count the number of bytes in the tree that have a given bit(s)
- * set. This can be fairly slow, except for EXTENT_DIRTY which is
- * cached. The total number found is returned.
- */
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- unsigned bits, int contig)
-{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- u64 total_bytes = 0;
- u64 last = 0;
- int found = 0;
-
- if (WARN_ON(search_end <= cur_start))
- return 0;
-
- spin_lock(&tree->lock);
- if (cur_start == 0 && bits == EXTENT_DIRTY) {
- total_bytes = tree->dirty_bytes;
- goto out;
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start > search_end)
- break;
- if (contig && found && state->start > last + 1)
- break;
- if (state->end >= cur_start && (state->state & bits) == bits) {
- total_bytes += min(search_end, state->end) + 1 -
- max(cur_start, state->start);
- if (total_bytes >= max_bytes)
- break;
- if (!found) {
- *start = max(cur_start, state->start);
- found = 1;
- }
- last = state->end;
- } else if (contig && found) {
- break;
- }
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return total_bytes;
-}
-
-/*
- * set the private field for a given byte offset in the tree. If there isn't
- * an extent_state there already, this does nothing.
- */
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
-{
- struct rb_node *node;
- struct extent_state *state;
- int ret = 0;
-
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- ret = -ENOENT;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- ret = -ENOENT;
- goto out;
- }
- state->failrec = failrec;
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
+ struct rb_node *exist;
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
-{
- struct rb_node *node;
- struct extent_state *state;
- int ret = 0;
+ spin_lock(&inode->io_failure_lock);
+ exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+ &failrec->rb_node);
+ spin_unlock(&inode->io_failure_lock);
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- ret = -ENOENT;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- ret = -ENOENT;
- goto out;
- }
- *failrec = state->failrec;
-out:
- spin_unlock(&tree->lock);
- return ret;
+ return (exist == NULL) ? 0 : -EEXIST;
}
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if every extent in the tree
- * has the bits set. Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled, struct extent_state *cached)
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
{
- struct extent_state *state = NULL;
struct rb_node *node;
- int bitset = 0;
-
- spin_lock(&tree->lock);
- if (cached && extent_state_in_tree(cached) && cached->start <= start &&
- cached->end > start)
- node = &cached->rb_node;
- else
- node = tree_search(tree, start);
- while (node && start <= end) {
- state = rb_entry(node, struct extent_state, rb_node);
+ struct io_failure_record *failrec = ERR_PTR(-ENOENT);
- if (filled && state->start > start) {
- bitset = 0;
- break;
- }
-
- if (state->start > end)
- break;
-
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
- break;
- }
-
- if (state->end == (u64)-1)
- break;
-
- start = state->end + 1;
- if (start > end)
- break;
- node = rb_next(node);
- if (!node) {
- if (filled)
- bitset = 0;
- break;
- }
- }
- spin_unlock(&tree->lock);
- return bitset;
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search(&inode->io_failure_tree, start);
+ if (node)
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ spin_unlock(&inode->io_failure_lock);
+ return failrec;
}
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+static void free_io_failure(struct btrfs_inode *inode,
+ struct io_failure_record *rec)
{
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
- SetPageUptodate(page);
-}
-
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec)
-{
- int ret;
- int err = 0;
-
- set_state_failrec(failure_tree, rec->start, NULL);
- ret = clear_extent_bits(failure_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret)
- err = ret;
-
- ret = clear_extent_bits(io_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_DAMAGED);
- if (ret && !err)
- err = ret;
+ spin_lock(&inode->io_failure_lock);
+ rb_erase(&rec->rb_node, &inode->io_failure_tree);
+ spin_unlock(&inode->io_failure_lock);
kfree(rec);
- return err;
}
/*
@@ -2186,26 +537,28 @@ int free_io_failure(struct extent_io_tree *failure_tree,
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
- struct bio *bio;
struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
u64 map_length = 0;
u64 sector;
- struct btrfs_bio *bbio = NULL;
- int ret;
+ struct btrfs_io_context *bioc = NULL;
+ int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
- bio = btrfs_io_bio_alloc(1);
- bio->bi_iter.bi_size = 0;
+ if (btrfs_repair_one_zone(fs_info, logical))
+ return 0;
+
map_length = length;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
@@ -2218,56 +571,54 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bbio, 0);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
- ASSERT(bbio->mirror_num == 1);
+ &map_length, &bioc, 0);
+ if (ret)
+ goto out_counter_dec;
+ ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
- BUG_ON(mirror_num != bbio->mirror_num);
+ &map_length, &bioc, mirror_num);
+ if (ret)
+ goto out_counter_dec;
+ BUG_ON(mirror_num != bioc->mirror_num);
}
- sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
- bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[bbio->mirror_num - 1].dev;
- btrfs_put_bbio(bbio);
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
+
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ ret = -EIO;
+ goto out_counter_dec;
}
- bio_set_dev(bio, dev->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
- bio_add_page(bio, page, length, pg_offset);
- if (btrfsic_submit_bio_wait(bio)) {
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
/* try to remap that extent elsewhere? */
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- return -EIO;
+ goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start,
rcu_str_deref(dev->name), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return 0;
+ return ret;
}
-int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 start = eb->start;
@@ -2290,63 +641,59 @@ int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
return ret;
}
+static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == failrec->num_copies)
+ return cur_mirror + 1 - failrec->num_copies;
+ return cur_mirror + 1;
+}
+
+static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == 1)
+ return failrec->num_copies;
+ return cur_mirror - 1;
+}
+
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset)
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset)
{
- u64 private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 ino = btrfs_ino(inode);
+ u64 locked_start, locked_end;
struct io_failure_record *failrec;
- struct extent_state *state;
- int num_copies;
+ int mirror;
int ret;
- private = 0;
- ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
- EXTENT_DIRTY, 0);
- if (!ret)
- return 0;
-
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret)
+ failrec = get_failrec(inode, start);
+ if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
- if (failrec->in_validation) {
- /* there was no real error, just free the record */
- btrfs_debug(fs_info,
- "clean_io_failure: freeing dummy error at %llu",
- failrec->start);
- goto out;
- }
if (sb_rdonly(fs_info->sb))
goto out;
- spin_lock(&io_tree->lock);
- state = find_first_extent_bit_state(io_tree,
- failrec->start,
- EXTENT_LOCKED);
- spin_unlock(&io_tree->lock);
-
- if (state && state->start <= failrec->start &&
- state->end >= failrec->start + failrec->len - 1) {
- num_copies = btrfs_num_copies(fs_info, failrec->logical,
- failrec->len);
- if (num_copies > 1) {
- repair_io_failure(fs_info, ino, start, failrec->len,
- failrec->logical, page, pg_offset,
- failrec->failed_mirror);
- }
- }
+ ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
+ &locked_end, EXTENT_LOCKED, NULL);
+ if (ret || locked_start > failrec->bytenr ||
+ locked_end < failrec->bytenr + failrec->len - 1)
+ goto out;
-out:
- free_io_failure(failure_tree, io_tree, failrec);
+ mirror = failrec->this_mirror;
+ do {
+ mirror = prev_mirror(failrec, mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset, mirror);
+ } while (mirror != failrec->failed_mirror);
+out:
+ free_io_failure(inode, failrec);
return 0;
}
@@ -2358,284 +705,288 @@ out:
*/
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct extent_io_tree *failure_tree = &inode->io_failure_tree;
struct io_failure_record *failrec;
- struct extent_state *state, *next;
+ struct rb_node *node, *next;
- if (RB_EMPTY_ROOT(&failure_tree->state))
+ if (RB_EMPTY_ROOT(&inode->io_failure_tree))
return;
- spin_lock(&failure_tree->lock);
- state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
- while (state) {
- if (state->start > end)
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search_first(&inode->io_failure_tree, start);
+ while (node) {
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ if (failrec->bytenr > end)
break;
- ASSERT(state->end <= end);
-
- next = next_state(state);
-
- failrec = state->failrec;
- free_extent_state(state);
+ next = rb_next(node);
+ rb_erase(&failrec->rb_node, &inode->io_failure_tree);
kfree(failrec);
- state = next;
+ node = next;
}
- spin_unlock(&failure_tree->lock);
+ spin_unlock(&inode->io_failure_lock);
}
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret)
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+ struct btrfs_bio *bbio,
+ unsigned int bio_offset)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 start = bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct extent_map *em;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
- u64 logical;
-
- ret = get_state_failrec(failure_tree, start, &failrec);
- if (ret) {
- failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
- if (!failrec)
- return -ENOMEM;
-
- failrec->start = start;
- failrec->len = end - start + 1;
- failrec->this_mirror = 0;
- failrec->bio_flags = 0;
- failrec->in_validation = 0;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return -EIO;
- }
-
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
- kfree(failrec);
- return -EIO;
- }
-
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags,
- em->compress_type);
- }
+ failrec = get_failrec(BTRFS_I(inode), start);
+ if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
- /* set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0)
- ret = set_state_failrec(failure_tree, start, failrec);
- /* set the bits in the inode's tree */
- if (ret >= 0)
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
- if (ret < 0) {
- kfree(failrec);
- return ret;
- }
- } else {
- btrfs_debug(fs_info,
- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
- failrec->logical, failrec->start, failrec->len,
- failrec->in_validation);
+ "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
+ failrec->logical, failrec->bytenr, failrec->len);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
+ ASSERT(failrec->this_mirror == bbio->mirror_num);
+ ASSERT(failrec->len == fs_info->sectorsize);
+ return failrec;
}
- *failrec_ret = failrec;
+ failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+ if (!failrec)
+ return ERR_PTR(-ENOMEM);
- return 0;
-}
+ RB_CLEAR_NODE(&failrec->rb_node);
+ failrec->bytenr = start;
+ failrec->len = sectorsize;
+ failrec->failed_mirror = bbio->mirror_num;
+ failrec->this_mirror = bbio->mirror_num;
+ failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
-bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- struct io_failure_record *failrec, int failed_mirror)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int num_copies;
+ btrfs_debug(fs_info,
+ "new io failure record logical %llu start %llu",
+ failrec->logical, start);
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
- if (num_copies == 1) {
+ failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
+ if (failrec->num_copies == 1) {
/*
- * we only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. no
+ * We only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. No
* matter what the error is, it is very likely to persist.
*/
btrfs_debug(fs_info,
- "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return false;
+ "cannot repair logical %llu num_copies %d",
+ failrec->logical, failrec->num_copies);
+ kfree(failrec);
+ return ERR_PTR(-EIO);
+ }
+
+ /* Set the bits in the private failure tree */
+ ret = insert_failrec(BTRFS_I(inode), failrec);
+ if (ret) {
+ kfree(failrec);
+ return ERR_PTR(ret);
}
+ return failrec;
+}
+
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
+ submit_bio_hook_t *submit_bio_hook)
+{
+ u64 start = failed_bbio->file_offset + bio_offset;
+ struct io_failure_record *failrec;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct bio *failed_bio = &failed_bbio->bio;
+ const int icsum = bio_offset >> fs_info->sectorsize_bits;
+ struct bio *repair_bio;
+ struct btrfs_bio *repair_bbio;
+
+ btrfs_debug(fs_info,
+ "repair read error: read error at %llu", start);
+
+ BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+
+ failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
+ if (IS_ERR(failrec))
+ return PTR_ERR(failrec);
+
/*
- * there are two premises:
- * a) deliver good data to the caller
- * b) correct the bad sectors on disk
+ * There are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ *
+ * Since we're only doing repair for one sector, we only need to get
+ * a good copy of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
*/
- if (failed_bio_pages > 1) {
- /*
- * to fulfill b), we need to know the exact failing sectors, as
- * we don't want to rewrite any more than the failed ones. thus,
- * we need separate read requests for the failed bio
- *
- * if the following BUG_ON triggers, our validation request got
- * merged. we need separate requests for our algorithm to work.
- */
- BUG_ON(failrec->in_validation);
- failrec->in_validation = 1;
- failrec->this_mirror = failed_mirror;
- } else {
- /*
- * we're ready to fulfill a) and b) alongside. get a good copy
- * of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
- */
- if (failrec->in_validation) {
- BUG_ON(failrec->this_mirror != failed_mirror);
- failrec->in_validation = 0;
- failrec->this_mirror = 0;
- }
- failrec->failed_mirror = failed_mirror;
- failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
+ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
+ if (failrec->this_mirror == failrec->failed_mirror) {
+ btrfs_debug(fs_info,
+ "failed to repair num_copies %d this_mirror %d failed_mirror %d",
+ failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
+ free_io_failure(BTRFS_I(inode), failrec);
+ return -EIO;
}
- if (failrec->this_mirror > num_copies) {
- btrfs_debug(fs_info,
- "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return false;
+ repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
+ failed_bbio->private);
+ repair_bbio = btrfs_bio(repair_bio);
+ repair_bbio->file_offset = start;
+ repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
+
+ if (failed_bbio->csum) {
+ const u32 csum_size = fs_info->csum_size;
+
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
}
- return true;
-}
+ bio_add_page(repair_bio, page, failrec->len, pgoff);
+ repair_bbio->iter = repair_bio->bi_iter;
+
+ btrfs_debug(btrfs_sb(inode->i_sb),
+ "repair read error: submitting new read to mirror %d",
+ failrec->this_mirror);
+ /*
+ * At this point we have a bio, so any errors from submit_bio_hook()
+ * will be handled by the endio on the repair_bio, so we can't return an
+ * error here.
+ */
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
+ return BLK_STS_OK;
+}
-struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- struct io_failure_record *failrec,
- struct page *page, int pg_offset, int icsum,
- bio_end_io_t *endio_func, void *data)
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio *bio;
- struct btrfs_io_bio *btrfs_failed_bio;
- struct btrfs_io_bio *btrfs_bio;
-
- bio = btrfs_io_bio_alloc(1);
- bio->bi_end_io = endio_func;
- bio->bi_iter.bi_sector = failrec->logical >> 9;
- bio->bi_iter.bi_size = 0;
- bio->bi_private = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
- btrfs_failed_bio = btrfs_io_bio(failed_bio);
- if (btrfs_failed_bio->csum) {
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_bio->csum = btrfs_bio->csum_inline;
- icsum *= csum_size;
- memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
- csum_size);
+ if (uptodate) {
+ if (fsverity_active(page->mapping->host) &&
+ !PageError(page) &&
+ !PageUptodate(page) &&
+ start < i_size_read(page->mapping->host) &&
+ !fsverity_verify_page(page)) {
+ btrfs_page_set_error(fs_info, page, start, len);
+ } else {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ }
+ } else {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
}
- bio_add_page(bio, page, failrec->len, pg_offset);
+ if (!btrfs_is_subpage(fs_info, page))
+ unlock_page(page);
+ else
+ btrfs_subpage_end_reader(fs_info, page, start, len);
+}
+
+static void end_sector_io(struct page *page, u64 offset, bool uptodate)
+{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct extent_state *cached = NULL;
- return bio;
+ end_page_read(page, uptodate, offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached, GFP_ATOMIC);
+ unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1,
+ &cached);
}
-/*
- * This is a generic handler for readpage errors. If other copies exist, read
- * those and write back good data to the failed position. Does not investigate
- * in remapping the failed extent elsewhere, hoping the device will be smart
- * enough to do this as needed
- */
-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int failed_mirror)
+static void submit_data_read_repair(struct inode *inode,
+ struct btrfs_bio *failed_bbio,
+ u32 bio_offset, const struct bio_vec *bvec,
+ unsigned int error_bitmap)
{
- struct io_failure_record *failrec;
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct bio *bio;
- int read_mode = 0;
- blk_status_t status;
- int ret;
- unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
+ const unsigned int pgoff = bvec->bv_offset;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct page *page = bvec->bv_page;
+ const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
+ const u64 end = start + bvec->bv_len - 1;
+ const u32 sectorsize = fs_info->sectorsize;
+ const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
+ int i;
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+ BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return ret;
+ /* This repair is only for data */
+ ASSERT(is_data_inode(inode));
- if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
- failed_mirror)) {
- free_io_failure(failure_tree, tree, failrec);
- return -EIO;
- }
+ /* We're here because we had some read errors or csum mismatch */
+ ASSERT(error_bitmap);
- if (failed_bio_pages > 1)
- read_mode |= REQ_FAILFAST_DEV;
+ /*
+ * We only get called on buffered IO, thus page must be mapped and bio
+ * must not be cloned.
+ */
+ ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- start - page_offset(page),
- (int)phy_offset, failed_bio->bi_end_io,
- NULL);
- bio->bi_opf = REQ_OP_READ | read_mode;
+ /* Iterate through all the sectors in the range */
+ for (i = 0; i < nr_bits; i++) {
+ const unsigned int offset = i * sectorsize;
+ bool uptodate = false;
+ int ret;
- btrfs_debug(btrfs_sb(inode->i_sb),
- "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
- read_mode, failrec->this_mirror, failrec->in_validation);
+ if (!(error_bitmap & (1U << i))) {
+ /*
+ * This sector has no error, just end the page read
+ * and unlock the range.
+ */
+ uptodate = true;
+ goto next;
+ }
- status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
- failrec->bio_flags);
- if (status) {
- free_io_failure(failure_tree, tree, failrec);
- bio_put(bio);
- ret = blk_status_to_errno(status);
+ ret = btrfs_repair_one_sector(inode, failed_bbio,
+ bio_offset + offset, page, pgoff + offset,
+ btrfs_submit_data_read_bio);
+ if (!ret) {
+ /*
+ * We have submitted the read repair, the page release
+ * will be handled by the endio function of the
+ * submitted repair bio.
+ * Thus we don't need to do any thing here.
+ */
+ continue;
+ }
+ /*
+ * Continue on failed repair, otherwise the remaining sectors
+ * will not be properly unlocked.
+ */
+next:
+ end_sector_io(page, start + offset, uptodate);
}
-
- return ret;
}
/* lots and lots of room for performance fixes in the end_bio funcs */
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
- int uptodate = (err == 0);
+ struct btrfs_inode *inode;
+ const bool uptodate = (err == 0);
int ret = 0;
- btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
+ ASSERT(page && page->mapping);
+ inode = BTRFS_I(page->mapping->host);
+ btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
if (!uptodate) {
- ClearPageUptodate(page);
- SetPageError(page);
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 len;
+
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
+
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
@@ -2650,56 +1001,155 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio)
+static void end_bio_extent_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
u64 end;
struct bvec_iter_all iter_all;
+ bool first_bvec = true;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ const u32 sectorsize = fs_info->sectorsize;
+
+ /* Our read/write should always be sector aligned. */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page write with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+
+ if (first_bvec) {
+ btrfs_record_physical_zoned(inode, start, bio);
+ first_bvec = false;
}
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
-
end_extent_writepage(page, error, start, end);
- end_page_writeback(page);
+
+ btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
}
bio_put(bio);
}
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
- int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * For endio_readpage_release_extent() to handle a full extent range, reducing
+ * the extent io operations.
+ */
+struct processed_extent {
+ struct btrfs_inode *inode;
+ /* Start of the range in @inode */
+ u64 start;
+ /* End of the range in @inode */
+ u64 end;
+ bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release processed extent when any of @inode, @uptodate, the range is
+ * no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+ struct btrfs_inode *inode, u64 start, u64 end,
+ bool uptodate)
{
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ struct extent_io_tree *tree;
- if (uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, start, end, &cached);
+ /* The first extent, initialize @processed */
+ if (!processed->inode)
+ goto update;
+
+ /*
+ * Contiguous to processed extent, just uptodate the end.
+ *
+ * Several things to notice:
+ *
+ * - bio can be merged as long as on-disk bytenr is contiguous
+ * This means we can have page belonging to other inodes, thus need to
+ * check if the inode still matches.
+ * - bvec can contain range beyond current page for multi-page bvec
+ * Thus we need to do processed->end + 1 >= start check
+ */
+ if (processed->inode == inode && processed->uptodate == uptodate &&
+ processed->end + 1 >= start && end >= processed->end) {
+ processed->end = end;
+ return;
+ }
+
+ tree = &processed->inode->io_tree;
+ /*
+ * Now we don't have range contiguous to the processed range, release
+ * the processed range now.
+ */
+ unlock_extent_atomic(tree, processed->start, processed->end, &cached);
+
+update:
+ /* Update processed to current range */
+ processed->inode = inode;
+ processed->start = start;
+ processed->end = end;
+ processed->uptodate = uptodate;
+}
+
+static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
+{
+ ASSERT(PageLocked(page));
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page));
+ btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+}
+
+/*
+ * Find extent buffer for a givne bytenr.
+ *
+ * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
+ * in endio context.
+ */
+static struct extent_buffer *find_extent_buffer_readpage(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *eb;
+
+ /*
+ * For regular sectorsize, we can use page->private to grab extent
+ * buffer
+ */
+ if (fs_info->nodesize >= PAGE_SIZE) {
+ ASSERT(PagePrivate(page) && page->private);
+ return (struct extent_buffer *)page->private;
+ }
+
+ /* For subpage case, we need to lookup buffer radix tree */
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ bytenr >> fs_info->sectorsize_bits);
+ rcu_read_unlock();
+ ASSERT(eb);
+ return eb;
}
/*
@@ -2713,325 +1163,541 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio)
+static void end_bio_extent_readpage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
- int uptodate = !bio->bi_status;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *tree, *failure_tree;
- u64 offset = 0;
- u64 start;
- u64 end;
- u64 len;
- u64 extent_start = 0;
- u64 extent_len = 0;
+ struct processed_extent processed = { 0 };
+ /*
+ * The offset to the beginning of a bio, since one bio can never be
+ * larger than UINT_MAX, u32 here is enough.
+ */
+ u32 bio_offset = 0;
int mirror;
- int ret;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
+ bool uptodate = !bio->bi_status;
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
+ const u32 sectorsize = fs_info->sectorsize;
+ unsigned int error_bitmap = (unsigned int)-1;
+ bool repair = false;
+ u64 start;
+ u64 end;
+ u32 len;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_status,
- io_bio->mirror_num);
- tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
-
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
+ bio->bi_iter.bi_sector, bio->bi_status,
+ bbio->mirror_num);
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ /*
+ * We always issue full-sector reads, but if some block in a
+ * page fails to read, blk_update_request() will advance
+ * bv_offset and adjust bv_len to compensate. Print a warning
+ * for unaligned offsets, and an error if they don't add up to
+ * a full sector.
+ */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
+ sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page read with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = io_bio->mirror_num;
+ mirror = bbio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
- if (ret)
- uptodate = 0;
- else
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, tree, start,
- page,
- btrfs_ino(BTRFS_I(inode)), 0);
+ if (is_data_inode(inode)) {
+ error_bitmap = btrfs_verify_data_csum(bbio,
+ bio_offset, page, start, end);
+ if (error_bitmap)
+ uptodate = false;
+ } else {
+ if (btrfs_validate_metadata_buffer(bbio,
+ page, start, end, mirror))
+ uptodate = false;
+ }
}
- if (likely(uptodate))
- goto readpage_ok;
+ if (likely(uptodate)) {
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
- if (data_inode) {
+ btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
/*
- * The generic bio_readpage_error handles errors the
- * following way: If possible, new read requests are
- * created and submitted and will end up in
- * end_bio_extent_readpage as well (if we're lucky,
- * not in the !uptodate case). In that case it returns
- * 0 and we just go on with the next page in our bio.
- * If it can't handle the error it will return -EIO and
- * we remain responsible for that page.
+ * Zero out the remaining part if this range straddles
+ * i_size.
+ *
+ * Here we should only zero the range inside the bvec,
+ * not touch anything else.
+ *
+ * NOTE: i_size is exclusive while end is inclusive.
*/
- ret = bio_readpage_error(bio, offset, page, start, end,
- mirror);
- if (ret == 0) {
- uptodate = !bio->bi_status;
- offset += len;
- continue;
+ if (page->index == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_page(i_size),
+ offset_in_page(start));
+
+ zero_user_segment(page, zero_start,
+ offset_in_page(end) + 1);
}
+ } else if (is_data_inode(inode)) {
+ /*
+ * Only try to repair bios that actually made it to a
+ * device. If the bio failed to be submitted mirror
+ * is 0 and we need to fail it without retrying.
+ *
+ * This also includes the high level bios for compressed
+ * extents - these never make it to a device and repair
+ * is already handled on the lower compressed bio.
+ */
+ if (mirror > 0)
+ repair = true;
} else {
struct extent_buffer *eb;
- eb = (struct extent_buffer *)page->private;
+ eb = find_extent_buffer_readpage(fs_info, page, start);
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror;
atomic_dec(&eb->io_pages);
- if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
- &eb->bflags))
- btree_readahead_hook(eb, -EIO);
}
-readpage_ok:
- if (likely(uptodate)) {
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned off;
- /* Zero out the end if this page straddles i_size */
- off = offset_in_page(i_size);
- if (page->index == end_index && off)
- zero_user_segment(page, off, PAGE_SIZE);
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- offset += len;
-
- if (unlikely(!uptodate)) {
- if (extent_len) {
- endio_readpage_release_extent(tree,
- extent_start,
- extent_len, 1);
- extent_start = 0;
- extent_len = 0;
- }
- endio_readpage_release_extent(tree, start,
- end - start + 1, 0);
- } else if (!extent_len) {
- extent_start = start;
- extent_len = end + 1 - start;
- } else if (extent_start + extent_len == start) {
- extent_len += end + 1 - start;
+ if (repair) {
+ /*
+ * submit_data_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
+ */
+ submit_data_read_repair(inode, bbio, bio_offset, bvec,
+ error_bitmap);
} else {
- endio_readpage_release_extent(tree, extent_start,
- extent_len, uptodate);
- extent_start = start;
- extent_len = end + 1 - start;
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, PageUptodate(page));
}
- }
- if (extent_len)
- endio_readpage_release_extent(tree, extent_start, extent_len,
- uptodate);
- btrfs_io_bio_free_csum(io_bio);
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
+
+ }
+ /* Release the last extent */
+ endio_readpage_release_extent(&processed, NULL, 0, 0, false);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
-/*
- * Initialize the members up to but not including 'bio'. Use after allocating a
- * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
- * 'bio' because use of __GFP_ZERO is not supported.
+/**
+ * Populate every free slot in a provided array with pages.
+ *
+ * @nr_pages: number of pages to allocate
+ * @page_array: the array to fill with pages; any existing non-null entries in
+ * the array will be skipped
+ *
+ * Return: 0 if all pages were able to be allocated;
+ * -ENOMEM otherwise, and the caller is responsible for freeing all
+ * non-null page pointers in the array.
*/
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
{
- memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+ unsigned int allocated;
+
+ for (allocated = 0; allocated < nr_pages;) {
+ unsigned int last = allocated;
+
+ allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+
+ if (allocated == nr_pages)
+ return 0;
+
+ /*
+ * During this iteration, no page could be allocated, even
+ * though alloc_pages_bulk_array() falls back to alloc_page()
+ * if it could not bulk-allocate. So we must be out of memory.
+ */
+ if (allocated == last)
+ return -ENOMEM;
+
+ memalloc_retry_wait(GFP_NOFS);
+ }
+ return 0;
}
-/*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail. We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+/**
+ * Attempt to add a page to bio
+ *
+ * @bio_ctrl: record both the bio, and its bio_flags
+ * @page: page to add to the bio
+ * @disk_bytenr: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @size: portion of page that we want to write
+ * @pg_offset: starting offset in the page
+ * @compress_type: compression type of the current bio to see if we can merge them
+ *
+ * Attempt to add a page to bio considering stripe alignment etc.
+ *
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at stripe/zone boundary.
+ * Return <0 for error.
*/
-struct bio *btrfs_bio_alloc(u64 first_byte)
-{
- struct bio *bio;
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page,
+ u64 disk_bytenr, unsigned int size,
+ unsigned int pg_offset,
+ enum btrfs_compression_type compress_type)
+{
+ struct bio *bio = bio_ctrl->bio;
+ u32 bio_size = bio->bi_iter.bi_size;
+ u32 real_size;
+ const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+ bool contig = false;
+ int ret;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
- bio->bi_iter.bi_sector = first_byte >> 9;
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
-}
+ ASSERT(bio);
+ /* The limit should be calculated when bio_ctrl->bio is allocated */
+ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
+ if (bio_ctrl->compress_type != compress_type)
+ return 0;
-struct bio *btrfs_bio_clone(struct bio *bio)
-{
- struct btrfs_io_bio *btrfs_bio;
- struct bio *new;
- /* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
- btrfs_bio = btrfs_io_bio(new);
- btrfs_io_bio_init(btrfs_bio);
- btrfs_bio->iter = bio->bi_iter;
- return new;
+ if (bio->bi_iter.bi_size == 0) {
+ /* We can always add a page into an empty bio. */
+ contig = true;
+ } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+
+ /*
+ * The contig check requires the following conditions to be met:
+ * 1) The pages are belonging to the same inode
+ * This is implied by the call chain.
+ *
+ * 2) The range has adjacent logical bytenr
+ *
+ * 3) The range has adjacent file offset
+ * This is required for the usage of btrfs_bio->file_offset.
+ */
+ if (bio_end_sector(bio) == sector &&
+ page_offset(bvec->bv_page) + bvec->bv_offset +
+ bvec->bv_len == page_offset(page) + pg_offset)
+ contig = true;
+ } else {
+ /*
+ * For compression, all IO should have its logical bytenr
+ * set to the starting bytenr of the compressed extent.
+ */
+ contig = bio->bi_iter.bi_sector == sector;
+ }
+
+ if (!contig)
+ return 0;
+
+ real_size = min(bio_ctrl->len_to_oe_boundary,
+ bio_ctrl->len_to_stripe_boundary) - bio_size;
+ real_size = min(real_size, size);
+
+ /*
+ * If real_size is 0, never call bio_add_*_page(), as even size is 0,
+ * bio will still execute its endio function on the page!
+ */
+ if (real_size == 0)
+ return 0;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
+ else
+ ret = bio_add_page(bio, page, real_size, pg_offset);
+
+ return ret;
}
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
+static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
+ struct btrfs_inode *inode, u64 file_offset)
{
- struct bio *bio;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_io_geometry geom;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em;
+ u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
+ int ret;
+
+ /*
+ * Pages for compressed extent are never submitted to disk directly,
+ * thus it has no real boundary, just set them to U32_MAX.
+ *
+ * The split happens for real compressed bio, which happens in
+ * btrfs_submit_compressed_read/write().
+ */
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ return 0;
+ }
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
+ logical, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ return ret;
+ }
+ if (geom.len > U32_MAX)
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ else
+ bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
- /* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
+ if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ /* Ordered extent not yet created, so we're good */
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (!ordered) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+ ordered->disk_bytenr + ordered->disk_num_bytes - logical);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
}
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+static int alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ struct writeback_control *wbc,
+ blk_opf_t opf,
+ u64 disk_bytenr, u32 offset, u64 file_offset,
+ enum btrfs_compression_type compress_type)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
- struct btrfs_io_bio *btrfs_bio;
+ int ret;
- /* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
- ASSERT(bio);
+ ASSERT(bio_ctrl->end_io_func);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_io_bio_init(btrfs_bio);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
+ /*
+ * For compressed page range, its disk_bytenr is always @disk_bytenr
+ * passed in, no matter if we have added any range into previous bio.
+ */
+ if (compress_type != BTRFS_COMPRESS_NONE)
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ else
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
+ bio_ctrl->bio = bio;
+ bio_ctrl->compress_type = compress_type;
+ ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+ if (ret < 0)
+ goto error;
- bio_trim(bio, offset >> 9, size >> 9);
- btrfs_bio->iter = bio->bi_iter;
- return bio;
+ if (wbc) {
+ /*
+ * For Zone append we need the correct block_device that we are
+ * going to write to set in the bio to be able to respect the
+ * hardware limitation. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
+
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
+ }
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
+ }
+ return 0;
+error:
+ bio_ctrl->bio = NULL;
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return ret;
}
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
- * @tree: tree so we can call our merge_bio hook
* @wbc: optional writeback control for io accounting
+ * @disk_bytenr: logical bytenr where the write will be
* @page: page to add to the bio
+ * @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @offset: starting offset in the page
- * @bio_ret: must be valid pointer, newly allocated bio will be stored there
- * @end_io_func: end_io callback for new bio
- * @mirror_num: desired mirror to read/write
- * @prev_bio_flags: flags of previous bio to see if we can merge the current one
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compress type for current bio
+ *
+ * The will either add the page into the existing @bio_ctrl->bio, or allocate a
+ * new one in @bio_ctrl->bio.
+ * The mirror number for this IO should already be initizlied in
+ * @bio_ctrl->mirror_num.
*/
-static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
+static int submit_extent_page(blk_opf_t opf,
struct writeback_control *wbc,
- struct page *page, u64 offset,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct page *page,
size_t size, unsigned long pg_offset,
- struct bio **bio_ret,
- bio_end_io_t end_io_func,
- int mirror_num,
- unsigned long prev_bio_flags,
- unsigned long bio_flags,
+ enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
int ret = 0;
- struct bio *bio;
- size_t page_size = min_t(size_t, size, PAGE_SIZE);
- sector_t sector = offset >> 9;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ unsigned int cur = pg_offset;
- ASSERT(bio_ret);
+ ASSERT(bio_ctrl);
- if (*bio_ret) {
- bool contig;
- bool can_merge = true;
+ ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
+ pg_offset + size <= PAGE_SIZE);
- bio = *bio_ret;
- if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
- contig = bio->bi_iter.bi_sector == sector;
- else
- contig = bio_end_sector(bio) == sector;
+ ASSERT(bio_ctrl->end_io_func);
- ASSERT(tree->ops);
- if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
- can_merge = false;
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
- if (prev_bio_flags != bio_flags || !contig || !can_merge ||
- force_bio_submit ||
- bio_add_page(bio, page, page_size, pg_offset) < page_size) {
- ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
- if (ret < 0) {
- *bio_ret = NULL;
+ while (cur < pg_offset + size) {
+ u32 offset = cur - pg_offset;
+ int added;
+
+ /* Allocate new bio if needed */
+ if (!bio_ctrl->bio) {
+ ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+ disk_bytenr, offset,
+ page_offset(page) + cur,
+ compress_type);
+ if (ret < 0)
return ret;
- }
- bio = NULL;
- } else {
- if (wbc)
- wbc_account_cgroup_owner(wbc, page, page_size);
- return 0;
}
+ /*
+ * We must go through btrfs_bio_add_page() to ensure each
+ * page range won't cross various boundaries.
+ */
+ if (compress_type != BTRFS_COMPRESS_NONE)
+ added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+ size - offset, pg_offset + offset,
+ compress_type);
+ else
+ added = btrfs_bio_add_page(bio_ctrl, page,
+ disk_bytenr + offset, size - offset,
+ pg_offset + offset, compress_type);
+
+ /* Metadata page range should never be split */
+ if (!is_data_inode(&inode->vfs_inode))
+ ASSERT(added == 0 || added == size - offset);
+
+ /* At least we added some page, update the account */
+ if (wbc && added)
+ wbc_account_cgroup_owner(wbc, page, added);
+
+ /* We have reached boundary, submit right now */
+ if (added < size - offset) {
+ /* The bio should contain some page(s) */
+ ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+ submit_one_bio(bio_ctrl);
+ }
+ cur += added;
}
+ return 0;
+}
- bio = btrfs_bio_alloc(offset);
- bio_add_page(bio, page, page_size, pg_offset);
- bio->bi_end_io = end_io_func;
- bio->bi_private = tree;
- bio->bi_write_hint = page->mapping->host->i_write_hint;
- bio->bi_opf = opf;
- if (wbc) {
- struct block_device *bdev;
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page,
+ struct btrfs_subpage *prealloc)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int ret = 0;
- bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, page_size);
+ /*
+ * If the page is mapped to btree inode, we should hold the private
+ * lock to prevent race.
+ * For cloned or dummy extent buffers, their pages are not mapped and
+ * will not race with any other ebs.
+ */
+ if (page->mapping)
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
+ WARN_ON(page->private != (unsigned long)eb);
+ return 0;
}
- *bio_ret = bio;
+ /* Already mapped, just free prealloc */
+ if (PagePrivate(page)) {
+ btrfs_free_subpage(prealloc);
+ return 0;
+ }
+ if (prealloc)
+ /* Has preallocated memory for subpage */
+ attach_page_private(page, prealloc);
+ else
+ /* Do new allocation to attach subpage */
+ ret = btrfs_attach_subpage(fs_info, page,
+ BTRFS_SUBPAGE_METADATA);
return ret;
}
-static void attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page)
+int set_page_extent_mapped(struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, (unsigned long)eb);
- } else {
- WARN_ON(page->private != (unsigned long)eb);
- }
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
+ if (PagePrivate(page))
+ return 0;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ if (btrfs_is_subpage(fs_info, page))
+ return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+
+ attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return 0;
}
-void set_page_extent_mapped(struct page *page)
+void clear_page_extent_mapped(struct page *page)
{
- if (!PagePrivate(page)) {
- SetPagePrivate(page);
- get_page(page);
- set_page_private(page, EXTENT_PAGE_PRIVATE);
- }
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
+ if (!PagePrivate(page))
+ return;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+ if (btrfs_is_subpage(fs_info, page))
+ return btrfs_detach_subpage(fs_info, page);
+
+ detach_page_private(page);
}
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3047,8 +1713,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR_OR_NULL(em)) {
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ if (em_cached && !IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
@@ -3062,106 +1728,88 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ blk_opf_t read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = page_offset(page);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
u64 block_start;
- u64 cur_end;
struct extent_map *em;
int ret = 0;
- int nr = 0;
size_t pg_offset = 0;
size_t iosize;
- size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = 0;
-
- set_page_extent_mapped(page);
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- if (!PageUptodate(page)) {
- if (cleancache_get_page(page) == 0) {
- BUG_ON(blocksize != PAGE_SIZE);
- unlock_extent(tree, start, end);
- goto out;
- }
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_extent(tree, start, end, NULL);
+ btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
+ unlock_page(page);
+ goto out;
}
if (page->index == last_byte >> PAGE_SHIFT) {
- char *userpage;
size_t zero_offset = offset_in_page(last_byte);
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
- userpage = kmap_atomic(page);
- memset(userpage + zero_offset, 0, iosize);
- flush_dcache_page(page);
- kunmap_atomic(userpage);
+ memzero_page(page, zero_offset, iosize);
}
}
+ bio_ctrl->end_io_func = end_bio_extent_readpage;
+ begin_page_read(fs_info, page);
while (cur <= end) {
+ unsigned long this_bio_flag = 0;
bool force_bio_submit = false;
- u64 offset;
+ u64 disk_bytenr;
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- char *userpage;
struct extent_state *cached = NULL;
iosize = PAGE_SIZE - pg_offset;
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0, iosize);
- flush_dcache_page(page);
- kunmap_atomic(userpage);
+ memzero_page(page, pg_offset, iosize);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
- if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
- unlock_extent(tree, cur, end);
+ end - cur + 1, em_cached);
+ if (IS_ERR(em)) {
+ unlock_extent(tree, cur, end, NULL);
+ end_page_read(page, false, cur, end + 1 - cur);
+ ret = PTR_ERR(em);
break;
}
extent_offset = cur - em->start;
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- this_bio_flag |= EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&this_bio_flag,
- em->compress_type);
- }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
- cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
- disk_io_size = em->block_len;
- offset = em->block_start;
- } else {
- offset = em->block_start + extent_offset;
- disk_io_size = iosize;
- }
+ if (this_bio_flag != BTRFS_COMPRESS_NONE)
+ disk_bytenr = em->block_start;
+ else
+ disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3206,126 +1854,83 @@ static int __do_readpage(struct extent_io_tree *tree,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- char *userpage;
struct extent_state *cached = NULL;
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0, iosize);
- flush_dcache_page(page);
- kunmap_atomic(userpage);
+ memzero_page(page, pg_offset, iosize);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end,
- EXTENT_UPTODATE, 1, NULL)) {
- check_page_uptodate(tree, page);
- unlock_extent(tree, cur, cur + iosize - 1);
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
- /* we have an inline extent but it didn't get marked up
- * to date. Error out
- */
if (block_start == EXTENT_MAP_INLINE) {
- SetPageError(page);
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
- ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
- page, offset, disk_io_size,
- pg_offset, bio,
- end_bio_extent_readpage, mirror_num,
- *bio_flags,
- this_bio_flag,
+ ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
+ bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset, this_bio_flag,
force_bio_submit);
- if (!ret) {
- nr++;
- *bio_flags = this_bio_flag;
- } else {
- SetPageError(page);
- unlock_extent(tree, cur, cur + iosize - 1);
+ if (ret) {
+ /*
+ * We have to unlock the remaining range, or the page
+ * will never be unlocked.
+ */
+ unlock_extent(tree, cur, end, NULL);
+ end_page_read(page, false, cur, end + 1 - cur);
goto out;
}
cur = cur + iosize;
pg_offset += iosize;
}
out:
- if (!nr) {
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
- }
return ret;
}
-static inline void contiguous_readpages(struct extent_io_tree *tree,
- struct page *pages[], int nr_pages,
- u64 start, u64 end,
- struct extent_map **em_cached,
- struct bio **bio,
- unsigned long *bio_flags,
- u64 *prev_em_start)
-{
- struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
- int index;
-
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
-
- for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
- put_page(pages[index]);
- }
-}
-
-static int __extent_read_full_page(struct extent_io_tree *tree,
- struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
+int btrfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ /*
+ * If btrfs_do_readpage() failed we will want to submit the assembled
+ * bio to do the cleanup.
+ */
+ submit_one_bio(&bio_ctrl);
return ret;
}
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
+static inline void contiguous_readpages(struct page *pages[], int nr_pages,
+ u64 start, u64 end,
+ struct extent_map **em_cached,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 *prev_em_start)
{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
+ struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
+ int index;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-static void update_nr_written(struct writeback_control *wbc,
- unsigned long nr_written)
-{
- wbc->nr_to_write -= nr_written;
+ for (index = 0; index < nr_pages; index++) {
+ btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
+ REQ_RAHEAD, prev_em_start);
+ put_page(pages[index]);
+ }
}
/*
@@ -3338,20 +1943,22 @@ static void update_nr_written(struct writeback_control *wbc,
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int writepage_delalloc(struct inode *inode,
- struct page *page, struct writeback_control *wbc,
- u64 delalloc_start, unsigned long *nr_written)
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
+ struct page *page, struct writeback_control *wbc)
{
- u64 page_end = delalloc_start + PAGE_SIZE - 1;
- bool found;
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
- u64 delalloc_end = 0;
+ /* How many pages are started by btrfs_run_delalloc_range() */
+ unsigned long nr_written = 0;
int ret;
int page_started = 0;
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
- while (delalloc_end < page_end) {
- found = find_lock_delalloc_range(inode, page,
+ found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@@ -3359,17 +1966,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
continue;
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, &page_started, nr_written, wbc);
+ delalloc_end, &page_started, &nr_written, wbc);
if (ret) {
- SetPageError(page);
- /*
- * btrfs_run_delalloc_range should return < 0 for error
- * but just in case, we use > 0 here meaning the IO is
- * started, so we don't want to return > 0 unless
- * things are going well.
- */
- ret = ret < 0 ? ret : -EIO;
- goto done;
+ btrfs_page_set_error(inode->root->fs_info, page,
+ page_offset(page), PAGE_SIZE);
+ return ret;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3388,23 +1989,69 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
thresh);
}
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
+ /* Did btrfs_run_dealloc_range() already unlock and start the IO? */
if (page_started) {
/*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
+ * We've unlocked the page, so we can't update the mapping's
+ * writeback index, just update nr_to_write.
*/
- wbc->nr_to_write -= *nr_written;
+ wbc->nr_to_write -= nr_written;
return 1;
}
- ret = 0;
+ return 0;
+}
-done:
- return ret;
+/*
+ * Find the first byte we need to write.
+ *
+ * For subpage, one page can contain several sectors, and
+ * __extent_writepage_io() will just grab all extent maps in the page
+ * range and try to submit all non-inline/non-compressed extents.
+ *
+ * This is a big problem for subpage, we shouldn't re-submit already written
+ * data at all.
+ * This function will lookup subpage dirty bit to find which range we really
+ * need to submit.
+ *
+ * Return the next dirty range in [@start, @end).
+ * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ */
+static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
+ struct page *page, u64 *start, u64 *end)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
+ u64 orig_start = *start;
+ /* Declare as unsigned long so we can use bitmap ops */
+ unsigned long flags;
+ int range_start_bit;
+ int range_end_bit;
+
+ /*
+ * For regular sector size == page size case, since one page only
+ * contains one sector, we return the page offset directly.
+ */
+ if (!btrfs_is_subpage(fs_info, page)) {
+ *start = page_offset(page);
+ *end = page_offset(page) + PAGE_SIZE;
+ return;
+ }
+
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
+ /* We should have the page locked, but just in case */
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
+ *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
+ *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
/*
@@ -3415,35 +2062,31 @@ done:
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
loff_t i_size,
- unsigned long nr_written,
int *nr_ret)
{
- struct extent_io_tree *tree = epd->tree;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
- u64 end;
- u64 cur = start;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 cur = page_offset(page);
+ u64 end = cur + PAGE_SIZE - 1;
u64 extent_offset;
u64 block_start;
- u64 iosize;
struct extent_map *em;
- size_t pg_offset = 0;
- size_t blocksize;
+ int saved_ret = 0;
int ret = 0;
int nr = 0;
- const unsigned int write_flags = wbc_to_write_flags(wbc);
+ enum req_op op = REQ_OP_WRITE;
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ bool has_error = false;
bool compressed;
- ret = btrfs_writepage_cow_fixup(page, start, page_end);
+ ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
- update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
}
@@ -3452,37 +2095,67 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(wbc, nr_written + 1);
-
- end = page_end;
- blocksize = inode->i_sb->s_blocksize;
+ wbc->nr_to_write--;
+ epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
while (cur <= end) {
+ u64 disk_bytenr;
u64 em_end;
- u64 offset;
+ u64 dirty_range_start = cur;
+ u64 dirty_range_end;
+ u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(page, cur,
- page_end, 1);
+ btrfs_writepage_endio_finish_ordered(inode, page, cur,
+ end, true);
+ /*
+ * This range is beyond i_size, thus we don't need to
+ * bother writing back.
+ * But we still need to clear the dirty subpage bit, or
+ * the next time the page gets dirtied, we will try to
+ * writeback the sectors with subpage dirty bits,
+ * causing writeback without ordered extent.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
break;
}
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
- end - cur + 1);
- if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
+
+ find_next_dirty_byte(fs_info, page, &dirty_range_start,
+ &dirty_range_end);
+ if (cur < dirty_range_start) {
+ cur = dirty_range_start;
+ continue;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
+ if (IS_ERR(em)) {
+ btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
break;
}
extent_offset = cur - em->start;
em_end = extent_map_end(em);
- BUG_ON(em_end <= cur);
- BUG_ON(end < cur);
- iosize = min(em_end - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
- offset = em->block_start + extent_offset;
+ ASSERT(cur <= em_end);
+ ASSERT(cur < end);
+ ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ disk_bytenr = em->block_start + extent_offset;
+
+ /*
+ * Note that em_end from extent_map_end() and dirty_range_end from
+ * find_next_dirty_byte() are all exclusive
+ */
+ iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
+
+ if (btrfs_use_zone_append(inode, em->block_start))
+ op = REQ_OP_ZONE_APPEND;
+
free_extent_map(em);
em = NULL;
@@ -3495,35 +2168,55 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
if (compressed)
nr++;
else
- btrfs_writepage_endio_finish_ordered(page, cur,
- cur + iosize - 1, 1);
+ btrfs_writepage_endio_finish_ordered(inode,
+ page, cur, cur + iosize - 1, true);
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
cur += iosize;
- pg_offset += iosize;
continue;
}
- btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
+ btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- btrfs_err(BTRFS_I(inode)->root->fs_info,
+ btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- page, offset, iosize, pg_offset,
- &epd->bio,
- end_bio_extent_writepage,
- 0, 0, 0, false);
+ /*
+ * Although the PageDirty bit is cleared before entering this
+ * function, subpage dirty bit is not cleared.
+ * So clear subpage dirty bit here so next time we won't submit
+ * page for range already written to disk.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+
+ ret = submit_extent_page(op | write_flags, wbc,
+ &epd->bio_ctrl, disk_bytenr,
+ page, iosize,
+ cur - page_offset(page),
+ 0, false);
if (ret) {
- SetPageError(page);
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
+
+ btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
- end_page_writeback(page);
+ btrfs_page_clear_writeback(fs_info, page, cur,
+ iosize);
}
- cur = cur + iosize;
- pg_offset += iosize;
+ cur += iosize;
nr++;
}
+ /*
+ * If we finish without problem, we should not only clear page dirty,
+ * but also empty subpage dirty bits
+ */
+ if (!has_error)
+ btrfs_page_assert_not_dirty(fs_info, page);
+ else
+ ret = saved_ret;
*nr_ret = nr;
return ret;
}
@@ -3540,52 +2233,51 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned long nr_written = 0;
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
- ClearPageError(page);
+ btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+ page_offset(page), PAGE_SIZE);
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return 0;
}
- if (page->index == end_index) {
- char *userpage;
+ if (page->index == end_index)
+ memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0,
- PAGE_SIZE - pg_offset);
- kunmap_atomic(userpage);
- flush_dcache_page(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ SetPageError(page);
+ goto done;
}
- set_page_extent_mapped(page);
-
if (!epd->extent_locked) {
- ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
if (ret == 1)
return 0;
if (ret)
goto done;
}
- ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+ &nr);
if (ret == 1)
return 0;
@@ -3595,11 +2287,52 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (PageError(page)) {
- ret = ret < 0 ? ret : -EIO;
- end_extent_writepage(page, ret, start, page_end);
+ /*
+ * Here we used to have a check for PageError() and then set @ret and
+ * call end_extent_writepage().
+ *
+ * But in fact setting @ret here will cause different error paths
+ * between subpage and regular sectorsize.
+ *
+ * For regular page size, we never submit current page, but only add
+ * current page to current bio.
+ * The bio submission can only happen in next page.
+ * Thus if we hit the PageError() branch, @ret is already set to
+ * non-zero value and will not get updated for regular sectorsize.
+ *
+ * But for subpage case, it's possible we submit part of current page,
+ * thus can get PageError() set by submitted bio of the same page,
+ * while our @ret is still 0.
+ *
+ * So here we unify the behavior and don't set @ret.
+ * Error can still be properly passed to higher layer as page will
+ * be set error, here we just don't handle the IO failure.
+ *
+ * NOTE: This is just a hotfix for subpage.
+ * The root fix will be properly ending ordered extent when we hit
+ * an error during writeback.
+ *
+ * But that needs a bigger refactoring, as we not only need to grab the
+ * submitted OE, but also need to know exactly at which bytenr we hit
+ * the error.
+ * Currently the full page based __extent_writepage_io() is not
+ * capable of that.
+ */
+ if (PageError(page))
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * the page can either be locked by lock_page() or
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
}
- unlock_page(page);
ASSERT(ret <= 0);
return ret;
}
@@ -3618,24 +2351,25 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
}
/*
- * Lock eb pages and flush the bio if we can't the locks
+ * Lock extent buffer status and pages for writeback.
+ *
+ * May try to flush write bio if we can't get the lock.
*
- * Return 0 if nothing went wrong
- * Return >0 is same as 0, except bio is not submitted
- * Return <0 if something went wrong, no page is locked
+ * Return 0 if the extent buffer doesn't need to be submitted.
+ * (E.g. the extent buffer is not dirty)
+ * Return >0 is the extent buffer is submitted to bio.
+ * Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages, failed_page_nr;
+ int i, num_pages;
int flush = 0;
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ submit_write_bio(epd, 0);
flush = 1;
btrfs_tree_lock(eb);
}
@@ -3645,9 +2379,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!epd->sync_io)
return 0;
if (!flush) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ submit_write_bio(epd, 0);
flush = 1;
}
while (1) {
@@ -3679,7 +2411,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
btrfs_tree_unlock(eb);
- if (!ret)
+ /*
+ * Either we don't need to submit any tree block, or we're submitting
+ * subpage eb.
+ * Subpage metadata doesn't use page locking at all, so we can skip
+ * the page locking.
+ */
+ if (!ret || fs_info->nodesize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@@ -3688,14 +2426,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- int err;
-
- err = flush_write_bio(epd);
- if (err < 0) {
- ret = err;
- failed_page_nr = i;
- goto err_unlock;
- }
+ submit_write_bio(epd, 0);
flush = 1;
}
lock_page(p);
@@ -3703,41 +2434,34 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
}
return ret;
-err_unlock:
- /* Unlock already locked pages */
- for (i = 0; i < failed_page_nr; i++)
- unlock_page(eb->pages[i]);
- /*
- * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
- * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
- * be made and undo everything done before.
- */
- btrfs_tree_lock(eb);
- spin_lock(&eb->refs_lock);
- set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- end_extent_buffer_writeback(eb);
- spin_unlock(&eb->refs_lock);
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
- fs_info->dirty_metadata_batch);
- btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- btrfs_tree_unlock(eb);
- return ret;
}
-static void set_btree_ioerr(struct page *page)
+static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
{
- struct extent_buffer *eb = (struct extent_buffer *)page->private;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, eb->start, eb->len);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
+ * We need to set the mapping with the io error as well because a write
+ * error will flip the file system readonly, and then syncfs() will
+ * return a 0 because we are readonly if we don't modify the err seq for
+ * the superblock.
+ */
+ mapping_set_error(page->mapping, -EIO);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
- fs_info = eb->fs_info;
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
eb->len, fs_info->dirty_metadata_batch);
@@ -3781,21 +2505,106 @@ static void set_btree_ioerr(struct page *page)
*/
switch (eb->log_index) {
case -1:
- set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
break;
case 0:
- set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
break;
case 1:
- set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
break;
default:
BUG(); /* unexpected, logic error */
}
}
-static void end_bio_extent_buffer_writepage(struct bio *bio)
+/*
+ * The endio specific version which won't touch any unsafe spinlock in endio
+ * context.
+ */
+static struct extent_buffer *find_extent_buffer_nolock(
+ struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ return eb;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * The endio function for subpage extent buffer write.
+ *
+ * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
+ * after all extent buffers in the page has finished their writeback.
+ */
+static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
+{
+ struct bio *bio = &bbio->bio;
+ struct btrfs_fs_info *fs_info;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
+ ASSERT(fs_info->nodesize < PAGE_SIZE);
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 bvec_end = bvec_start + bvec->bv_len - 1;
+ u64 cur_bytenr = bvec_start;
+
+ ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
+
+ /* Iterate through all extent buffers in the range */
+ while (cur_bytenr <= bvec_end) {
+ struct extent_buffer *eb;
+ int done;
+
+ /*
+ * Here we can't use find_extent_buffer(), as it may
+ * try to lock eb->refs_lock, which is not safe in endio
+ * context.
+ */
+ eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
+ ASSERT(eb);
+
+ cur_bytenr = eb->start + eb->len;
+
+ ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
+ done = atomic_dec_and_test(&eb->io_pages);
+ ASSERT(done);
+
+ if (bio->bi_status ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ ClearPageUptodate(page);
+ set_btree_ioerr(page, eb);
+ }
+
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+ eb->len);
+ end_extent_buffer_writeback(eb);
+ /*
+ * free_extent_buffer() will grab spinlock which is not
+ * safe in endio context. Thus here we manually dec
+ * the ref.
+ */
+ atomic_dec(&eb->refs);
+ }
+ }
+ bio_put(bio);
+}
+
+static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
@@ -3812,7 +2621,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
- set_btree_ioerr(page);
+ set_btree_ioerr(page, eb);
}
end_page_writeback(page);
@@ -3826,51 +2635,105 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
bio_put(bio);
}
-static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
- struct writeback_control *wbc,
- struct extent_page_data *epd)
+static void prepare_eb_write(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
- u64 offset = eb->start;
u32 nritems;
- int i, num_pages;
- unsigned long start, end;
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
- int ret = 0;
+ unsigned long start;
+ unsigned long end;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
- num_pages = num_extent_pages(eb);
- atomic_set(&eb->io_pages, num_pages);
+ atomic_set(&eb->io_pages, num_extent_pages(eb));
- /* set btree blocks beyond nritems with 0 to avoid stale content. */
+ /* Set btree blocks beyond nritems with 0 to avoid stale content */
nritems = btrfs_header_nritems(eb);
if (btrfs_header_level(eb) > 0) {
end = btrfs_node_key_ptr_offset(nritems);
-
memzero_extent_buffer(eb, end, eb->len - end);
} else {
/*
- * leaf:
+ * Leaf:
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems);
end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
memzero_extent_buffer(eb, start, end - start);
}
+}
+
+/*
+ * Unlike the work in write_one_eb(), we rely completely on extent locking.
+ * Page locking is only utilized at minimum to keep the VMM code happy.
+ */
+static int write_one_subpage_eb(struct extent_buffer *eb,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ bool no_dirty_ebs = false;
+ int ret;
+
+ prepare_eb_write(eb);
+
+ /* clear_page_dirty_for_io() in subpage helper needs page locked */
+ lock_page(page);
+ btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
+ /* Check if this is the last dirty bit to update nr_written */
+ no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
+ eb->start, eb->len);
+ if (no_dirty_ebs)
+ clear_page_dirty_for_io(page);
+
+ epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
+
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ &epd->bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, false);
+ if (ret) {
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
+ set_btree_ioerr(page, eb);
+ unlock_page(page);
+
+ if (atomic_dec_and_test(&eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ return -EIO;
+ }
+ unlock_page(page);
+ /*
+ * Submission finished without problem, if no range of the page is
+ * dirty anymore, we have submitted a page. Update nr_written in wbc.
+ */
+ if (no_dirty_ebs)
+ wbc->nr_to_write--;
+ return ret;
+}
+
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ u64 disk_bytenr = eb->start;
+ int i, num_pages;
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ int ret = 0;
+
+ prepare_eb_write(eb);
+
+ epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
+
+ num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- p, offset, PAGE_SIZE, 0,
- &epd->bio,
- end_bio_extent_buffer_writepage,
- 0, 0, 0, false);
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ &epd->bio_ctrl, disk_bytenr, p,
+ PAGE_SIZE, 0, 0, false);
if (ret) {
- set_btree_ioerr(p);
+ set_btree_ioerr(p, eb);
if (PageWriteback(p))
end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
@@ -3878,8 +2741,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = -EIO;
break;
}
- offset += PAGE_SIZE;
- update_nr_written(wbc, 1);
+ disk_bytenr += PAGE_SIZE;
+ wbc->nr_to_write--;
unlock_page(p);
}
@@ -3894,17 +2757,206 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
return ret;
}
+/*
+ * Submit one subpage btree page.
+ *
+ * The main difference to submit_eb_page() is:
+ * - Page locking
+ * For subpage, we don't rely on page locking at all.
+ *
+ * - Flush write bio
+ * We only flush bio if we may be unable to fit current extent buffers into
+ * current bio.
+ *
+ * Return >=0 for the number of submitted extent buffers.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_subpage(struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ int submitted = 0;
+ u64 page_start = page_offset(page);
+ int bit_start = 0;
+ int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int ret;
+
+ /* Lock and write each dirty extent buffers in the range */
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct extent_buffer *eb;
+ unsigned long flags;
+ u64 start;
+
+ /*
+ * Take private lock to ensure the subpage won't be detached
+ * in the meantime.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ break;
+ }
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+ bit_start++;
+ continue;
+ }
+
+ start = page_start + bit_start * fs_info->sectorsize;
+ bit_start += sectors_per_node;
+
+ /*
+ * Here we just want to grab the eb without touching extra
+ * spin locks, so call find_extent_buffer_nolock().
+ */
+ eb = find_extent_buffer_nolock(fs_info, start);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+
+ /*
+ * The eb has already reached 0 refs thus find_extent_buffer()
+ * doesn't return it. We don't need to write back such eb
+ * anyway.
+ */
+ if (!eb)
+ continue;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret == 0) {
+ free_extent_buffer(eb);
+ continue;
+ }
+ if (ret < 0) {
+ free_extent_buffer(eb);
+ goto cleanup;
+ }
+ ret = write_one_subpage_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto cleanup;
+ submitted++;
+ }
+ return submitted;
+
+cleanup:
+ /* We hit error, end bio for the submitted extent buffers */
+ submit_write_bio(epd, ret);
+ return ret;
+}
+
+/*
+ * Submit all page(s) of one extent buffer.
+ *
+ * @page: the page of one extent buffer
+ * @eb_context: to determine if we need to submit this page, if current page
+ * belongs to this eb, we don't need to submit
+ *
+ * The caller should pass each page in their bytenr order, and here we use
+ * @eb_context to determine if we have submitted pages of one extent buffer.
+ *
+ * If we have, we just skip until we hit a new page that doesn't belong to
+ * current @eb_context.
+ *
+ * If not, we submit all the page(s) of the extent buffer.
+ *
+ * Return >0 if we have submitted the extent buffer successfully.
+ * Return 0 if we don't need to submit the page, as it's already submitted by
+ * previous call.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ struct extent_buffer **eb_context)
+{
+ struct address_space *mapping = page->mapping;
+ struct btrfs_block_group *cache = NULL;
+ struct extent_buffer *eb;
+ int ret;
+
+ if (!PagePrivate(page))
+ return 0;
+
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ return submit_eb_subpage(page, wbc, epd);
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON but no point
+ * crashing the machine for something we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ if (eb == *eb_context) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ return 0;
+
+ if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
+ /*
+ * If for_sync, this hole will be filled with
+ * trasnsaction commit.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ free_extent_buffer(eb);
+ return ret;
+ }
+
+ *eb_context = eb;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret <= 0) {
+ btrfs_revert_meta_write_pointer(cache, eb);
+ if (cache)
+ btrfs_put_block_group(cache);
+ free_extent_buffer(eb);
+ return ret;
+ }
+ if (cache) {
+ /*
+ * Implies write in zoned mode. Mark the last eb in a block group.
+ */
+ btrfs_schedule_zone_finish_bg(cache, eb);
+ btrfs_put_block_group(cache);
+ }
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
- struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
- .bio = NULL,
- .tree = tree,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -3933,6 +2985,7 @@ int btree_write_cache_pages(struct address_space *mapping,
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
+ btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
@@ -3944,55 +2997,13 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (!PagePrivate(page))
+ ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ if (ret == 0)
continue;
-
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- eb = (struct extent_buffer *)page->private;
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON
- * but no sense in crashing the users box for something
- * we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- if (eb == prev_eb) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
- if (!ret)
- continue;
-
- prev_eb = eb;
- ret = lock_extent_buffer_for_io(eb, &epd);
- if (!ret) {
- free_extent_buffer(eb);
- continue;
- } else if (ret < 0) {
- done = 1;
- free_extent_buffer(eb);
- break;
- }
-
- ret = write_one_eb(eb, wbc, &epd);
- if (ret) {
+ if (ret < 0) {
done = 1;
- free_extent_buffer(eb);
break;
}
- free_extent_buffer(eb);
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4013,20 +3024,52 @@ retry:
index = 0;
goto retry;
}
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
- ret = flush_write_bio(&epd);
+ /*
+ * If something went wrong, don't allow any metadata write bio to be
+ * submitted.
+ *
+ * This would prevent use-after-free if we had dirty pages not
+ * cleaned up, which can still happen by fuzzed images.
+ *
+ * - Bad extent tree
+ * Allowing existing tree block to be allocated for other trees.
+ *
+ * - Log tree operations
+ * Exiting tree blocks get allocated to log tree, bumps its
+ * generation, then get cleaned in tree re-balance.
+ * Such tree block will not be written back, since it's clean,
+ * thus no WRITTEN flag set.
+ * And after log writes back, this tree block is not traced by
+ * any dirty extent_io_tree.
+ *
+ * - Offending tree block gets re-dirtied from its original owner
+ * Since it has bumped generation, no WRITTEN flag, it can be
+ * reused without COWing. This tree block will not be traced
+ * by btrfs_transaction::dirty_pages.
+ *
+ * Now such dirty tree block will not be cleaned by any dirty
+ * extent io tree. Thus we don't want to submit such wild eb
+ * if the fs already has error.
+ *
+ * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * were submitted. Reset it to 0 to avoid false alerts for the caller.
+ */
+ if (ret > 0)
+ ret = 0;
+ if (!ret && BTRFS_FS_ERROR(fs_info))
+ ret = -EROFS;
+ submit_write_bio(&epd, ret);
+
+ btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * Walk the list of dirty pages of the given address space and write all of them.
+ *
* @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @data: data passed to __extent_writepage function
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @epd: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -4119,8 +3162,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
+ submit_write_bio(epd, 0);
lock_page(page);
}
@@ -4130,10 +3172,8 @@ retry:
}
if (wbc->sync_mode != WB_SYNC_NONE) {
- if (PageWriteback(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
- }
+ if (PageWriteback(page))
+ submit_write_bio(epd, 0);
wait_on_page_writeback(page);
}
@@ -4173,9 +3213,8 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- ret = flush_write_bio(epd);
- if (!ret)
- goto retry;
+ submit_write_bio(epd, 0);
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -4185,47 +3224,28 @@ retry:
return ret;
}
-int extent_write_full_page(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- struct extent_page_data epd = {
- .bio = NULL,
- .tree = &BTRFS_I(page->mapping->host)->io_tree,
- .extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
- };
-
- ret = __extent_writepage(page, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
-
- ret = flush_write_bio(&epd);
- ASSERT(ret <= 0);
- return ret;
-}
-
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode)
+/*
+ * Submit the pages in the range to bio for call sites which delalloc range has
+ * already been ran (aka, ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
{
+ bool found_error = false;
+ int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
-
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
struct extent_page_data epd = {
- .bio = NULL,
- .tree = tree,
+ .bio_ctrl = { 0 },
.extent_locked = 1,
- .sync_io = mode == WB_SYNC_ALL,
+ .sync_io = 1,
};
struct writeback_control wbc_writepages = {
- .sync_mode = mode,
- .nr_to_write = nr_pages * 2,
+ .sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
@@ -4233,125 +3253,119 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.no_cgroup_owner = 1,
};
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
- while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_SHIFT);
- if (clear_page_dirty_for_io(page))
- ret = __extent_writepage(page, &wbc_writepages, &epd);
- else {
- btrfs_writepage_endio_finish_ordered(page, start,
- start + PAGE_SIZE - 1, 1);
- unlock_page(page);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
}
put_page(page);
- start += PAGE_SIZE;
+ cur = cur_end + 1;
}
- ASSERT(ret <= 0);
- if (ret == 0)
- ret = flush_write_bio(&epd);
- else
- end_write_bio(&epd, ret);
+ submit_write_bio(&epd, found_error ? ret : 0);
wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
return ret;
}
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct inode *inode = mapping->host;
int ret = 0;
struct extent_page_data epd = {
- .bio = NULL,
- .tree = &BTRFS_I(mapping->host)->io_tree,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
- ret = flush_write_bio(&epd);
+ submit_write_bio(&epd, ret);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages)
+void extent_readahead(struct readahead_control *rac)
{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
- int nr = 0;
u64 prev_em_start = (u64)-1;
+ int nr;
- while (!list_empty(pages)) {
- u64 contig_end = 0;
-
- for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
- struct page *page = lru_to_page(pages);
-
- prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping))) {
- put_page(page);
- break;
- }
+ while ((nr = readahead_page_batch(rac, pagepool))) {
+ u64 contig_start = readahead_pos(rac);
+ u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
- pagepool[nr++] = page;
- contig_end = page_offset(page) + PAGE_SIZE - 1;
- }
-
- if (nr) {
- u64 contig_start = page_offset(pagepool[0]);
-
- ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
-
- contiguous_readpages(tree, pagepool, nr, contig_start,
- contig_end, &em_cached, &bio, &bio_flags,
- &prev_em_start);
- }
+ contiguous_readpages(pagepool, nr, contig_start, contig_end,
+ &em_cached, &bio_ctrl, &prev_em_start);
}
if (em_cached)
free_extent_map(em_cached);
-
- if (bio)
- return submit_one_bio(bio, 0, bio_flags);
- return 0;
+ submit_one_bio(&bio_ctrl);
}
/*
- * basic invalidatepage code, this waits on any locked or writeback
- * ranges corresponding to the page, and then deletes any extent state
+ * basic invalidate_folio code, this waits on any locked or writeback
+ * ranges corresponding to the folio, and then deletes any extent state
* records from the tree
*/
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset)
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset)
{
struct extent_state *cached_state = NULL;
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ u64 start = folio_pos(folio);
+ u64 end = start + folio_size(folio) - 1;
+ size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
+
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, &cached_state);
- wait_on_page_writeback(page);
- clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
+ lock_extent(tree, start, end, &cached_state);
+ folio_wait_writeback(folio);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ unlock_extent(tree, start, end, &cached_state);
return 0;
}
/*
- * a helper for releasepage, this tests for areas of the page that
+ * a helper for release_folio, this tests for areas of the page that
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
@@ -4365,13 +3379,17 @@ static int try_release_extent_state(struct extent_io_tree *tree,
if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
ret = 0;
} else {
+ u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+
/*
- * at this point we can safely clear everything except the
- * locked bit and the nodatasum bit
+ * At this point we can safely clear everything except the
+ * locked bit, the nodatasum bit and the delalloc new bit.
+ * The delalloc new bit will be cleared by ordered extent
+ * completion.
*/
- ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask, NULL);
+ ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
+ mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4385,7 +3403,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,
}
/*
- * a helper for releasepage. As long as there are no locked extents
+ * a helper for release_folio. As long as there are no locked extents
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
@@ -4402,6 +3420,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
+ struct btrfs_fs_info *fs_info;
+ u64 cur_gen;
+
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
@@ -4415,62 +3436,58 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (!test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL)) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &btrfs_inode->runtime_flags);
- remove_extent_mapping(map, em);
- /* once for the rb tree */
- free_extent_map(em);
- }
+ if (test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED, 0, NULL))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used
+ * by a fast fsync, we can remove it. If it's being
+ * logged we can safely remove it since fsync took an
+ * extra reference on the em.
+ */
+ if (list_empty(&em->list) ||
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ goto remove_em;
+ /*
+ * If it's in the list of modified extents, remove it
+ * only if its generation is older then the current one,
+ * in which case we don't need it for a fast fsync.
+ * Otherwise don't remove it, we could be racing with an
+ * ongoing fast fsync that could miss the new extent.
+ */
+ fs_info = btrfs_inode->root->fs_info;
+ spin_lock(&fs_info->trans_lock);
+ cur_gen = fs_info->generation;
+ spin_unlock(&fs_info->trans_lock);
+ if (em->generation >= cur_gen)
+ goto next;
+remove_em:
+ /*
+ * We only remove extent maps that are not in the list of
+ * modified extents or that are in the list but with a
+ * generation lower then the current generation, so there
+ * is no need to set the full fsync flag on the inode (it
+ * hurts the fsync performance for workloads with a data
+ * size that exceeds or is close to the system's memory).
+ */
+ remove_extent_mapping(map, em);
+ /* once for the rb tree */
+ free_extent_map(em);
+next:
start = extent_map_end(em);
write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
}
/*
- * helper function for fiemap, which doesn't want to see any holes.
- * This maps until we find something past 'last'
- */
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
- u64 offset, u64 last)
-{
- u64 sectorsize = btrfs_inode_sectorsize(inode);
- struct extent_map *em;
- u64 len;
-
- if (offset >= last)
- return NULL;
-
- while (1) {
- len = last - offset;
- if (len == 0)
- break;
- len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
- if (IS_ERR_OR_NULL(em))
- return em;
-
- /* if this isn't a hole return it */
- if (em->block_start != EXTENT_MAP_HOLE)
- return em;
-
- /* this is a hole, advance to the next extent */
- offset = extent_map_end(em);
- free_extent_map(em);
- if (offset >= last)
- break;
- }
- return NULL;
-}
-
-/*
* To cache previous fiemap extent
*
* Will be used for merging fiemap extent
@@ -4499,6 +3516,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
{
int ret = 0;
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
if (!cache->cached)
goto assign;
@@ -4522,16 +3542,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* So truly compressed (physical size smaller than logical size)
* extents won't get merged with each other
*
- * 3) Share same flags except FIEMAP_EXTENT_LAST
- * So regular extent won't get merged with prealloc extent
+ * 3) Share same flags
*/
if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
- (cache->flags & ~FIEMAP_EXTENT_LAST) ==
- (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->flags == flags) {
cache->len += len;
- cache->flags |= flags;
- goto try_submit_last;
+ return 0;
}
/* Not mergeable, need to submit cached one */
@@ -4546,13 +3563,8 @@ assign:
cache->phys = phys;
cache->len = len;
cache->flags = flags;
-try_submit_last:
- if (cache->flags & FIEMAP_EXTENT_LAST) {
- ret = fiemap_fill_next_extent(fieinfo, cache->offset,
- cache->phys, cache->len, cache->flags);
- cache->cached = false;
- }
- return ret;
+
+ return 0;
}
/*
@@ -4582,212 +3594,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- int ret = 0;
- u64 off = start;
- u64 max = start + len;
- u32 flags = 0;
- u32 found_type;
- u64 last;
- u64 last_for_get_extent = 0;
- u64 disko = 0;
- u64 isize = i_size_read(inode);
- struct btrfs_key found_key;
- struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
- struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct fiemap_cache cache = { 0 };
- struct ulist *roots;
- struct ulist *tmp_ulist;
- int end = 0;
- u64 em_start = 0;
- u64 em_len = 0;
- u64 em_end = 0;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
- if (len == 0)
- return -EINVAL;
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
- path = btrfs_alloc_path();
- if (!path)
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
return -ENOMEM;
- path->leave_spinning = 1;
- roots = ulist_alloc(GFP_KERNEL);
- tmp_ulist = ulist_alloc(GFP_KERNEL);
- if (!roots || !tmp_ulist) {
- ret = -ENOMEM;
- goto out_free_ulist;
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
}
- start = round_down(start, btrfs_inode_sectorsize(inode));
- len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+ }
/*
- * lookup the last file extent. We're not using i_size here
- * because there might be preallocation past i_size
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we call fiemap_fill_next_extent(), because that may cause a page
+ * fault when filling the user space buffer with fiemap data.
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
- if (ret < 0) {
- goto out_free_ulist;
- } else {
- WARN_ON(!ret);
- if (ret == 1)
- ret = 0;
- }
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct btrfs_backref_shared_cache *backref_cache,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp_ulist,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
+
+ /*
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
+ */
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = found_key.type;
-
- /* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
- /* have to trust i_size as the end */
- last = (u64)-1;
- last_for_get_extent = isize;
- } else {
/*
- * remember the start of the last extent. There are a
- * bunch of different factors that go into the length of the
- * extent, so its much less complex to remember where it started
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
*/
- last = found_key.offset;
- last_for_get_extent = last + 1;
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
}
- btrfs_release_path(path);
/*
- * we might have some extents allocated but more delalloc past those
- * extents. so, we trust isize unless the start of the last extent is
- * beyond isize
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
*/
- if (last < isize) {
- last = (u64)-1;
- last_for_get_extent = isize;
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- &cached_state);
+ return 0;
+}
- em = get_extent_skip_holes(inode, start, last_for_get_extent);
- if (!em)
- goto out;
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where inline data starts at,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_shared_cache *backref_cache;
+ struct ulist *roots;
+ struct ulist *tmp_ulist;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ bool stopped = false;
+ int ret;
+
+ backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+ path = btrfs_alloc_path();
+ roots = ulist_alloc(GFP_KERNEL);
+ tmp_ulist = ulist_alloc(GFP_KERNEL);
+ if (!backref_cache || !path || !roots || !tmp_ulist) {
+ ret = -ENOMEM;
goto out;
}
- while (!end) {
- u64 offset_in_extent = 0;
+ lockstart = round_down(start, root->fs_info->sectorsize);
+ lockend = round_up(start + len, root->fs_info->sectorsize);
+ prev_extent_end = lockstart;
- /* break if the extent we found is outside the range */
- if (em->start >= max || extent_map_end(em) < off)
- break;
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- /*
- * get_extent may return an extent that starts before our
- * requested range. We have to make sure the ranges
- * we return to fiemap always move forward and don't
- * overlap, so adjust the offsets here
- */
- em_start = max(em->start, off);
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, lockstart);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
/*
- * record the offset from the start of the extent
- * for adjusting the disk offset below. Only do this if the
- * extent isn't compressed since our in ram offset may be past
- * what we have actually allocated on disk.
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
*/
- if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- offset_in_extent = em_start - em->start;
- em_end = extent_map_end(em);
- em_len = em_end - em_start;
- flags = 0;
- if (em->block_start < EXTENT_MAP_LAST_BYTE)
- disko = em->block_start + offset_in_extent;
- else
- disko = 0;
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < lockend) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
/*
- * bump off for our next call to get_extent
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
*/
- off = extent_map_end(em);
- if (off >= max)
- end = 1;
-
- if (em->block_start == EXTENT_MAP_LAST_BYTE) {
- end = 1;
- flags |= FIEMAP_EXTENT_LAST;
- } else if (em->block_start == EXTENT_MAP_INLINE) {
- flags |= (FIEMAP_EXTENT_DATA_INLINE |
- FIEMAP_EXTENT_NOT_ALIGNED);
- } else if (em->block_start == EXTENT_MAP_DELALLOC) {
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- } else if (fieinfo->fi_extents_max) {
- u64 bytenr = em->block_start -
- (em->start - em->orig_start);
+ if (extent_end <= lockstart)
+ goto next_item;
- /*
- * As btrfs supports shared space, this information
- * can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
- * then we're just getting a count and we can skip the
- * lookup stuff.
- */
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
- bytenr, roots, tmp_ulist);
- if (ret < 0)
- goto out_free;
- if (ret)
- flags |= FIEMAP_EXTENT_SHARED;
- ret = 0;
+ /* We have in implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 range_end = min(key.offset, lockend) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ prev_extent_end, range_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= lockend) {
+ stopped = true;
+ break;
+ }
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
flags |= FIEMAP_EXTENT_ENCODED;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- free_extent_map(em);
- em = NULL;
- if ((em_start >= last) || em_len == (u64)-1 ||
- (last == (u64)-1 && isize <= em_end)) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache,
+ disk_bytenr, extent_offset,
+ extent_gen, roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(root, ino,
+ disk_bytenr,
+ extent_gen,
+ roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
}
- /* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
}
- if (!em) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
}
- ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
- em_len, flags);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out_free;
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
}
+ cond_resched();
}
-out_free:
- if (!ret)
- ret = emit_last_fiemap_cache(fieinfo, &cache);
- free_extent_map(em);
-out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- &cached_state);
-out_free_ulist:
+check_eof_delalloc:
+ /*
+ * Release (and free) the path before emitting any final entries to
+ * fiemap_fill_next_extent() to keep lockdep happy. This is because
+ * once we find no more file extent items exist, we may have a
+ * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+ * faults when copying data to the user space buffer.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ if (!stopped && prev_extent_end < lockend) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
+ 0, 0, 0, roots, tmp_ulist,
+ prev_extent_end, lockend - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = lockend;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+out:
+ kfree(backref_cache);
btrfs_free_path(path);
ulist_free(roots);
ulist_free(tmp_ulist);
@@ -4796,36 +4130,55 @@ out_free_ulist:
static void __free_extent_buffer(struct extent_buffer *eb)
{
- btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
-int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-/*
- * Release all pages attached to the extent buffer.
- */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
{
- int i;
- int num_pages;
- int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+ struct btrfs_subpage *subpage;
- BUG_ON(extent_buffer_under_io(eb));
+ lockdep_assert_held(&page->mapping->private_lock);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ if (PagePrivate(page)) {
+ subpage = (struct btrfs_subpage *)page->private;
+ if (atomic_read(&subpage->eb_refs))
+ return true;
+ /*
+ * Even there is no eb refs here, we may still have
+ * end_page_read() call relying on page::private.
+ */
+ if (atomic_read(&subpage->readers))
+ return true;
+ }
+ return false;
+}
- if (!page)
- continue;
+static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+ /*
+ * For mapped eb, we're going to change the page private, which should
+ * be done under the private_lock.
+ */
+ if (mapped)
+ spin_lock(&page->mapping->private_lock);
+
+ if (!PagePrivate(page)) {
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ if (fs_info->nodesize >= PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -4842,14 +4195,51 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
* We need to make sure we haven't be attached
* to a new eb.
*/
- ClearPagePrivate(page);
- set_page_private(page, 0);
- /* One for the page private */
- put_page(page);
+ detach_page_private(page);
}
-
if (mapped)
spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ /*
+ * For subpage, we can have dummy eb with page private. In this case,
+ * we can directly detach the private as such page is only attached to
+ * one dummy eb, no sharing.
+ */
+ if (!mapped) {
+ btrfs_detach_subpage(fs_info, page);
+ return;
+ }
+
+ btrfs_page_dec_eb_refs(fs_info, page);
+
+ /*
+ * We can only detach the page private if there are no other ebs in the
+ * page range and no unfinished IO.
+ */
+ if (!page_range_has_eb(fs_info, page))
+ btrfs_detach_subpage(fs_info, page);
+
+ spin_unlock(&page->mapping->private_lock);
+}
+
+/* Release all pages attached to the extent buffer */
+static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+
+ ASSERT(!extent_buffer_under_io(eb));
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = eb->pages[i];
+
+ if (!page)
+ continue;
+
+ detach_extent_buffer_page(eb, page);
/* One for when we allocated the page */
put_page(page);
@@ -4862,6 +4252,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
+ btrfs_leak_debug_del_eb(eb);
__free_extent_buffer(eb);
}
@@ -4876,62 +4267,58 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->len = len;
eb->fs_info = fs_info;
eb->bflags = 0;
- rwlock_init(&eb->lock);
- atomic_set(&eb->blocking_readers, 0);
- eb->blocking_writers = 0;
- eb->lock_nested = false;
- init_waitqueue_head(&eb->write_lock_wq);
- init_waitqueue_head(&eb->read_lock_wq);
+ init_rwsem(&eb->lock);
- btrfs_leak_debug_add(&eb->leak_list, &buffers);
+ btrfs_leak_debug_add_eb(eb);
+ INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- /*
- * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
- */
- BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
- > MAX_INLINE_EXTENT_BUFFER_SIZE);
- BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
-
-#ifdef CONFIG_BTRFS_DEBUG
- eb->spinning_writers = 0;
- atomic_set(&eb->spinning_readers, 0);
- atomic_set(&eb->read_locks, 0);
- eb->write_locks = 0;
-#endif
+ ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
-struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
- struct page *p;
struct extent_buffer *new;
int num_pages = num_extent_pages(src);
+ int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
return NULL;
+ /*
+ * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
+ * btrfs_release_extent_buffer() have different behavior for
+ * UNMAPPED subpage extent buffer.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+
+ memset(new->pages, 0, sizeof(*new->pages) * num_pages);
+ ret = btrfs_alloc_page_array(num_pages, new->pages);
+ if (ret) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+
for (i = 0; i < num_pages; i++) {
- p = alloc_page(GFP_NOFS);
- if (!p) {
+ int ret;
+ struct page *p = new->pages[i];
+
+ ret = attach_extent_buffer_page(new, p, NULL);
+ if (ret < 0) {
btrfs_release_extent_buffer(new);
return NULL;
}
- attach_extent_buffer_page(new, p);
WARN_ON(PageDirty(p));
- SetPageUptodate(p);
- new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
-
- set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
- set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ set_extent_buffer_uptodate(new);
return new;
}
@@ -4942,25 +4329,37 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int num_pages;
int i;
+ int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
num_pages = num_extent_pages(eb);
+ ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ if (ret)
+ goto err;
+
for (i = 0; i < num_pages; i++) {
- eb->pages[i] = alloc_page(GFP_NOFS);
- if (!eb->pages[i])
+ struct page *p = eb->pages[i];
+
+ ret = attach_extent_buffer_page(eb, p, NULL);
+ if (ret < 0)
goto err;
}
+
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
err:
- for (; i > 0; i--)
- __free_page(eb->pages[i - 1]);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i]) {
+ detach_extent_buffer_page(eb, eb->pages[i]);
+ __free_page(eb->pages[i]);
+ }
+ }
__free_extent_buffer(eb);
return NULL;
}
@@ -4974,25 +4373,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling release_folio when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, release_folio can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5025,36 +4427,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_SHIFT);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- /*
- * Lock our eb's refs_lock to avoid races with
- * free_extent_buffer. When we get our eb it might be flagged
- * with EXTENT_BUFFER_STALE and another task running
- * free_extent_buffer might have seen that flag set,
- * eb->refs == 2, that the buffer isn't under IO (dirty and
- * writeback flags not set) and it's still in the tree (flag
- * EXTENT_BUFFER_TREE_REF set), therefore being in the process
- * of decrementing the extent buffer's reference count twice.
- * So here we could race and increment the eb's reference count,
- * clear its stale flag, mark it as dirty and drop our reference
- * before the other task finishes executing free_extent_buffer,
- * which would later result in an attempt to free an extent
- * buffer that is dirty.
- */
- if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
- spin_lock(&eb->refs_lock);
- spin_unlock(&eb->refs_lock);
- }
- mark_extent_buffer_accessed(eb, NULL);
- return eb;
+ eb = find_extent_buffer_nolock(fs_info, start);
+ if (!eb)
+ return NULL;
+ /*
+ * Lock our eb's refs_lock to avoid races with free_extent_buffer().
+ * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
+ * another task running free_extent_buffer() might have seen that flag
+ * set, eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
+ * decrementing the extent buffer's reference count twice. So here we
+ * could race and increment the eb's reference count, clear its stale
+ * flag, mark it as dirty and drop our reference before the other task
+ * finishes executing free_extent_buffer, which would later result in
+ * an attempt to free an extent buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
}
- rcu_read_unlock();
-
- return NULL;
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5079,7 +4473,7 @@ again:
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5099,8 +4493,64 @@ free_eb:
}
#endif
+static struct extent_buffer *grab_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page)
+{
+ struct extent_buffer *exists;
+
+ /*
+ * For subpage case, we completely rely on radix tree to ensure we
+ * don't try to insert two ebs for the same bytenr. So here we always
+ * return NULL and just continue.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return NULL;
+
+ /* Page not yet attached to an extent buffer */
+ if (!PagePrivate(page))
+ return NULL;
+
+ /*
+ * We could have already allocated an eb for this page and attached one
+ * so lets see if we can get a ref on the existing eb, and if we can we
+ * know it's good and we can just return that one, else we know we can
+ * just overwrite page->private.
+ */
+ exists = (struct extent_buffer *)page->private;
+ if (atomic_inc_not_zero(&exists->refs))
+ return exists;
+
+ WARN_ON(PageDirty(page));
+ detach_page_private(page);
+ return NULL;
+}
+
+static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+{
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return -EINVAL;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE &&
+ offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ if (fs_info->nodesize >= PAGE_SIZE &&
+ !PAGE_ALIGNED(start)) {
+ btrfs_err(fs_info,
+ "tree block is not page aligned, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
int num_pages;
@@ -5110,13 +4560,23 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *exists = NULL;
struct page *p;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ u64 lockdep_owner = owner_root;
int uptodate = 1;
int ret;
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
- btrfs_err(fs_info, "bad tree block start %llu", start);
+ if (check_eb_alignment(fs_info, start))
return ERR_PTR(-EINVAL);
+
+#if BITS_PER_LONG == 32
+ if (start >= MAX_LFS_FILESIZE) {
+ btrfs_err_rl(fs_info,
+ "extent buffer %llu is beyond 32bit page cache limit", start);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
}
+ if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ btrfs_warn_32bit_limit(fs_info);
+#endif
eb = find_extent_buffer(fs_info, start);
if (eb)
@@ -5126,44 +4586,72 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return ERR_PTR(-ENOMEM);
+ /*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
+ struct btrfs_subpage *prealloc = NULL;
+
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
goto free_eb;
}
- spin_lock(&mapping->private_lock);
- if (PagePrivate(p)) {
- /*
- * We could have already allocated an eb for this page
- * and attached one so lets see if we can get a ref on
- * the existing eb, and if we can we know it's good and
- * we can just return that one, else we know we can just
- * overwrite page->private.
- */
- exists = (struct extent_buffer *)p->private;
- if (atomic_inc_not_zero(&exists->refs)) {
- spin_unlock(&mapping->private_lock);
+ /*
+ * Preallocate page->private for subpage case, so that we won't
+ * allocate memory with private_lock hold. The memory will be
+ * freed by attach_extent_buffer_page() or freed manually if
+ * we exit earlier.
+ *
+ * Although we have ensured one subpage eb can only have one
+ * page, but it may change in the future for 16K page size
+ * support, so we still preallocate the memory in the loop.
+ */
+ if (fs_info->nodesize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
unlock_page(p);
put_page(p);
- mark_extent_buffer_accessed(exists, p);
+ exists = ERR_PTR(ret);
goto free_eb;
}
- exists = NULL;
+ }
- /*
- * Do this so attach doesn't complain and we need to
- * drop the ref the old guy had.
- */
- ClearPagePrivate(p);
- WARN_ON(PageDirty(p));
+ spin_lock(&mapping->private_lock);
+ exists = grab_extent_buffer(fs_info, p);
+ if (exists) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
put_page(p);
+ mark_extent_buffer_accessed(exists, p);
+ btrfs_free_subpage(prealloc);
+ goto free_eb;
}
- attach_extent_buffer_page(eb, p);
+ /* Should not fail, as we have preallocated the memory */
+ ret = attach_extent_buffer_page(eb, p, prealloc);
+ ASSERT(!ret);
+ /*
+ * To inform we have extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the page private
+ * when the eb hasn't yet been inserted into radix tree.
+ *
+ * The ref will be decreased when the eb released the page, in
+ * detach_extent_buffer_page().
+ * Thus needs no special handling in error path.
+ */
+ btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
- WARN_ON(PageDirty(p));
+
+ WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -5171,7 +4659,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
/*
* We can't unlock the pages just yet since the extent buffer
* hasn't been properly inserted in the radix tree, this
- * opens a race with btree_releasepage which can free a page
+ * opens a race with btree_release_folio which can free a page
* while we are still filling in all pages for the buffer and
* we could crash.
*/
@@ -5187,7 +4675,7 @@ again:
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5203,7 +4691,7 @@ again:
/*
* Now it's safe to unlock the pages because any calls to
- * btree_releasepage will correctly detect that a page belongs to a
+ * btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
for (i = 0; i < num_pages; i++)
@@ -5230,6 +4718,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
}
static int release_extent_buffer(struct extent_buffer *eb)
+ __releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
@@ -5242,12 +4731,13 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_SHIFT);
+ eb->start >> fs_info->sectorsize_bits);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
}
+ btrfs_leak_debug_del_eb(eb);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5267,18 +4757,16 @@ static int release_extent_buffer(struct extent_buffer *eb)
void free_extent_buffer(struct extent_buffer *eb)
{
int refs;
- int old;
if (!eb)
return;
+ refs = atomic_read(&eb->refs);
while (1) {
- refs = atomic_read(&eb->refs);
if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
|| (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
refs == 1))
break;
- old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
- if (old == refs)
+ if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
return;
}
@@ -5310,28 +4798,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-void clear_extent_buffer_dirty(struct extent_buffer *eb)
+static void btree_clear_page_dirty(struct page *page)
+{
+ ASSERT(PageDirty(page));
+ ASSERT(PageLocked(page));
+ clear_page_dirty_for_io(page);
+ xa_lock_irq(&page->mapping->i_pages);
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&page->mapping->i_pages);
+}
+
+static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ bool last;
+
+ /* btree_clear_page_dirty() needs page locked */
+ lock_page(page);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
+ eb->len);
+ if (last)
+ btree_clear_page_dirty(page);
+ unlock_page(page);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
+void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
struct page *page;
+ if (eb->fs_info->nodesize < PAGE_SIZE)
+ return clear_subpage_extent_buffer_dirty(eb);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (!PageDirty(page))
continue;
-
lock_page(page);
- WARN_ON(!PagePrivate(page));
-
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ btree_clear_page_dirty(page);
ClearPageError(page);
unlock_page(page);
}
@@ -5352,10 +4863,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
- if (!was_dirty)
- for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ if (!was_dirty) {
+ bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+ /*
+ * For subpage case, we can have other extent buffers in the
+ * same page, and in clear_subpage_extent_buffer_dirty() we
+ * have to clear page dirty without subpage lock held.
+ * This can cause race where our page gets dirty cleared after
+ * we just set it.
+ *
+ * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * its page for other reasons, we can use page lock to prevent
+ * the above race.
+ */
+ if (subpage)
+ lock_page(eb->pages[0]);
+ for (i = 0; i < num_pages; i++)
+ btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
+ eb->start, eb->len);
+ if (subpage)
+ unlock_page(eb->pages[0]);
+ }
#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
ASSERT(PageDirty(eb->pages[i]));
@@ -5366,33 +4895,117 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- if (page)
+ if (!page)
+ continue;
+
+ /*
+ * This is special handling for metadata subpage, as regular
+ * btrfs_is_subpage() can not handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
ClearPageUptodate(page);
+ else
+ btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- SetPageUptodate(page);
+
+ /*
+ * This is special handling for metadata subpage, as regular
+ * btrfs_is_subpage() can not handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ SetPageUptodate(page);
+ else
+ btrfs_subpage_set_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
+static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct extent_io_tree *io_tree;
+ struct page *page = eb->pages[0];
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
+ int ret = 0;
+
+ ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
+ ASSERT(PagePrivate(page));
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+
+ if (wait == WAIT_NONE) {
+ if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
+ return -EAGAIN;
+ } else {
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = 0;
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
+ PageUptodate(page) ||
+ btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
+ return ret;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, 1);
+ check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
+
+ btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
+
+ btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
+ ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
+ eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, true);
+ if (ret) {
+ /*
+ * In the endio function, if we hit something wrong we will
+ * increase the io_pages, so here we need to decrease it for
+ * error path.
+ */
+ atomic_dec(&eb->io_pages);
+ }
+ submit_one_bio(&bio_ctrl);
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ ret = -EIO;
+ return ret;
+}
+
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
int i;
@@ -5403,17 +5016,35 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
+ /*
+ * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
+ * operation, which could potentially still be in flight. In this case
+ * we simply want to return an error.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
+ return -EIO;
+
+ if (eb->fs_info->nodesize < PAGE_SIZE)
+ return read_extent_buffer_subpage(eb, wait, mirror_num);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (wait == WAIT_NONE) {
+ /*
+ * WAIT_NONE is only utilized by readahead. If we can't
+ * acquire the lock atomically it means either the eb
+ * is being read out or under modification.
+ * Either way the eb will be or has been cached,
+ * readahead can exit safely.
+ */
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -5442,6 +5073,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for release_folio to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -5453,20 +5090,18 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(tree, page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ, NULL,
+ &bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5474,11 +5109,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
- if (bio) {
- err = submit_one_bio(bio, mirror_num, bio_flags);
- if (err)
- return err;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -5501,6 +5132,36 @@ unlock_exit:
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5509,17 +5170,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
- offset = offset_in_page(start_offset + start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5535,30 +5191,29 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start_offset + start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5572,48 +5227,6 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
return ret;
}
-/*
- * return 0 if the item is found within a page.
- * return 1 if the item spans two pages.
- * return -EINVAL otherwise.
- */
-int map_private_extent_buffer(const struct extent_buffer *eb,
- unsigned long start, unsigned long min_len,
- char **map, unsigned long *map_start,
- unsigned long *map_len)
-{
- size_t offset;
- char *kaddr;
- struct page *p;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
- unsigned long end_i = (start_offset + start + min_len - 1) >>
- PAGE_SHIFT;
-
- if (start + min_len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, min_len);
- return -EINVAL;
- }
-
- if (i != end_i)
- return 1;
-
- if (i == 0) {
- offset = start_offset;
- *map_start = 0;
- } else {
- offset = 0;
- *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
- }
-
- p = eb->pages[i];
- kaddr = page_address(p);
- *map = kaddr + offset;
- *map_len = PAGE_SIZE - offset;
- return 0;
-}
-
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
@@ -5622,14 +5235,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
- offset = offset_in_page(start_offset + start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5649,28 +5261,61 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
return ret;
}
-void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+/*
+ * Check that the extent buffer is uptodate.
+ *
+ * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
+ * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
+ */
+static void assert_eb_page_uptodate(const struct extent_buffer *eb,
+ struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ /*
+ * If we are using the commit root we could potentially clear a page
+ * Uptodate while we're using the extent buffer that we've previously
+ * looked up. We don't want to complain in this case, as the page was
+ * valid before, we just didn't write it out. Instead we want to catch
+ * the case where we didn't actually read the block properly, which
+ * would have !PageUptodate && !PageError, as we clear PageError before
+ * reading.
+ */
+ if (fs_info->nodesize < PAGE_SIZE) {
+ bool uptodate, error;
+
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page,
+ eb->start, eb->len);
+ error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+ WARN_ON(!uptodate && !error);
+ } else {
+ WARN_ON(!PageUptodate(page) && !PageError(page));
+ }
+}
+
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
- WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
- memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
- BTRFS_FSID_SIZE);
+ assert_eb_page_uptodate(eb, eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
+ chunk_tree_uuid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
-void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
- WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
- memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
- BTRFS_FSID_SIZE);
+ assert_eb_page_uptodate(eb, eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
-void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
unsigned long start, unsigned long len)
{
size_t cur;
@@ -5678,17 +5323,18 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
- offset = offset_in_page(start_offset + start);
+ if (check_eb_range(eb, start, len))
+ return;
+
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@@ -5701,24 +5347,23 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
}
}
-void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = offset_in_page(eb->start);
- unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
- offset = offset_in_page(start_offset + start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@@ -5730,21 +5375,32 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
}
}
-void copy_extent_buffer_full(struct extent_buffer *dst,
- struct extent_buffer *src)
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src)
{
int i;
int num_pages;
ASSERT(dst->len == src->len);
- num_pages = num_extent_pages(dst);
- for (i = 0; i < num_pages; i++)
- copy_page(page_address(dst->pages[i]),
- page_address(src->pages[i]));
+ if (dst->fs_info->nodesize >= PAGE_SIZE) {
+ num_pages = num_extent_pages(dst);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+ } else {
+ size_t src_offset = get_eb_offset_in_page(src, 0);
+ size_t dst_offset = get_eb_offset_in_page(dst, 0);
+
+ ASSERT(src->fs_info->nodesize < PAGE_SIZE);
+ memcpy(page_address(dst->pages[0]) + dst_offset,
+ page_address(src->pages[0]) + src_offset,
+ src->len);
+ }
}
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
@@ -5753,16 +5409,19 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = offset_in_page(dst->start);
- unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(dst_offset);
+
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
WARN_ON(src->len != dst_len);
- offset = offset_in_page(start_offset + dst_offset);
+ offset = get_eb_offset_in_page(dst, dst_offset);
while (len > 0) {
page = dst->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(dst, page);
cur = min(len, (unsigned long)(PAGE_SIZE - offset));
@@ -5789,12 +5448,11 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
-static inline void eb_bitmap_offset(struct extent_buffer *eb,
+static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
unsigned long *page_index,
size_t *page_offset)
{
- size_t start_offset = offset_in_page(eb->start);
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -5803,7 +5461,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start_offset + start + byte_offset;
+ offset = start + offset_in_page(eb->start) + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
@@ -5815,7 +5473,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
u8 *kaddr;
@@ -5825,7 +5483,7 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
eb_bitmap_offset(eb, start, nr, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
@@ -5837,7 +5495,7 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
* @pos: bit number of the first bit
* @len: number of bits to set
*/
-void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
u8 *kaddr;
@@ -5850,7 +5508,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_set) {
@@ -5861,7 +5519,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
@@ -5879,8 +5537,9 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
* @pos: bit number of the first bit
* @len: number of bits to clear
*/
-void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- unsigned long pos, unsigned long len)
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len)
{
u8 *kaddr;
struct page *page;
@@ -5892,7 +5551,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_clear) {
@@ -5903,7 +5562,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
@@ -5941,36 +5600,26 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len)
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
- dst_off_in_page = offset_in_page(start_offset + dst_offset);
- src_off_in_page = offset_in_page(start_offset + src_offset);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
+ src_off_in_page = get_eb_offset_in_page(dst, src_offset);
- dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
- src_i = (start_offset + src_offset) >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_offset);
+ src_i = get_eb_page_index(src_offset);
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -5986,41 +5635,31 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len)
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = offset_in_page(dst->start);
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
while (len > 0) {
- dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
- src_i = (start_offset + src_end) >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_end);
+ src_i = get_eb_page_index(src_end);
- dst_off_in_page = offset_in_page(start_offset + dst_end);
- src_off_in_page = offset_in_page(start_offset + src_end);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
+ src_off_in_page = get_eb_offset_in_page(dst, src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
@@ -6034,13 +5673,124 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
}
}
+#define GANG_LOOKUP_SIZE 16
+static struct extent_buffer *get_next_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
+ struct extent_buffer *found = NULL;
+ u64 page_start = page_offset(page);
+ u64 cur = page_start;
+
+ ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ lockdep_assert_held(&fs_info->buffer_lock);
+
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
+ }
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
+ }
+out:
+ return found;
+}
+
+static int try_release_subpage_extent_buffer(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ u64 cur = page_offset(page);
+ const u64 end = page_offset(page) + PAGE_SIZE;
+ int ret;
+
+ while (cur < end) {
+ struct extent_buffer *eb = NULL;
+
+ /*
+ * Unlike try_release_extent_buffer() which uses page->private
+ * to grab buffer, for subpage case we rely on radix tree, thus
+ * we need to ensure radix tree consistency.
+ *
+ * We also want an atomic snapshot of the radix tree, thus go
+ * with spinlock rather than RCU.
+ */
+ spin_lock(&fs_info->buffer_lock);
+ eb = get_next_extent_buffer(fs_info, page, cur);
+ if (!eb) {
+ /* No more eb in the page range after or at cur */
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ cur = eb->start + eb->len;
+
+ /*
+ * The same as try_release_extent_buffer(), to ensure the eb
+ * won't disappear out from under us.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a
+ * real ref, so just return, this eb will likely be freed soon
+ * anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ break;
+ }
+
+ /*
+ * Here we don't care about the return value, we will always
+ * check the page private at the end. And
+ * release_extent_buffer() will release the refs_lock.
+ */
+ release_extent_buffer(eb);
+ }
+ /*
+ * Finally to check if we have cleared page private, as if we have
+ * released all ebs in the page, the page private should be cleared now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page))
+ ret = 1;
+ else
+ ret = 0;
+ spin_unlock(&page->mapping->private_lock);
+ return ret;
+
+}
+
int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(page);
+
/*
- * We need to make sure nobody is attaching this page to an eb right
- * now.
+ * We need to make sure nobody is changing page->private, as we rely on
+ * page->private as the pointer to extent buffer.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
@@ -6075,3 +5825,54 @@ int try_release_extent_buffer(struct page *page)
return release_extent_buffer(eb);
}
+
+/*
+ * btrfs_readahead_tree_block - attempt to readahead a child block
+ * @fs_info: the fs_info
+ * @bytenr: bytenr to read
+ * @owner_root: objectid of the root that owns this eb
+ * @gen: generation for the uptodate check, can be 0
+ * @level: level for the eb
+ *
+ * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
+ * normal uptodate check of the eb, without checking the generation. If we have
+ * to read the block we will not block on anything.
+ */
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level)
+{
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(eb))
+ return;
+
+ if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(eb);
+ else
+ free_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_node_child - readahead a node's child block
+ * @node: parent node we're reading from
+ * @slot: slot in the parent node for the child we want to read
+ *
+ * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
+ * the slot in the node provided.
+ */
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
+{
+ btrfs_readahead_tree_block(node->fs_info,
+ btrfs_node_blockptr(node, slot),
+ btrfs_header_owner(node),
+ btrfs_node_ptr_generation(node, slot),
+ btrfs_header_level(node) - 1);
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5d205bbaafdc..7929f054dda3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -5,15 +5,11 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
+#include <linux/fiemap.h>
+#include <linux/btrfs_tree.h>
+#include "compression.h"
#include "ulist.h"
-/*
- * flags for bio submission. The high bits indicate the compression
- * type for this bio
- */
-#define EXTENT_BIO_COMPRESSED 1
-#define EXTENT_BIO_FLAG_SHIFT 16
-
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
@@ -29,16 +25,17 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
+ EXTENT_BUFFER_NO_CHECK,
};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
-#define PAGE_CLEAR_DIRTY (1 << 1)
-#define PAGE_SET_WRITEBACK (1 << 2)
-#define PAGE_END_WRITEBACK (1 << 3)
-#define PAGE_SET_PRIVATE2 (1 << 4)
-#define PAGE_SET_ERROR (1 << 5)
-#define PAGE_LOCK (1 << 6)
+/* Page starts writeback, clear dirty bit and set writeback bit */
+#define PAGE_START_WRITEBACK (1 << 1)
+#define PAGE_END_WRITEBACK (1 << 2)
+#define PAGE_SET_ORDERED (1 << 3)
+#define PAGE_SET_ERROR (1 << 4)
+#define PAGE_LOCK (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
@@ -60,30 +57,24 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
-struct btrfs_io_bio;
+struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
-typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
- struct bio *bio, u64 bio_offset);
+int __init extent_buffer_init_cachep(void);
+void __cold extent_buffer_free_cachep(void);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
+typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type);
+typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
+ struct bio *bio, u64 dio_file_offset);
-#define INLINE_EXTENT_BUFFER_PAGES 16
-#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
+#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
@@ -95,31 +86,14 @@ struct extent_buffer {
int read_mirror;
struct rcu_head rcu_head;
pid_t lock_owner;
-
- int blocking_writers;
- atomic_t blocking_readers;
- bool lock_nested;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
- short log_index;
-
- /* protects write locks */
- rwlock_t lock;
+ s8 log_index;
- /* readers use lock_wq while they wait for the write
- * lock holders to unlock
- */
- wait_queue_head_t write_lock_wq;
+ struct rw_semaphore lock;
- /* writers use read_lock_wq while they wait for readers
- * to unlock
- */
- wait_queue_head_t read_lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct list_head release_list;
#ifdef CONFIG_BTRFS_DEBUG
- int spinning_writers;
- atomic_t spinning_readers;
- atomic_t read_locks;
- int write_locks;
struct list_head leak_list;
#endif
};
@@ -129,7 +103,7 @@ struct extent_buffer {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
@@ -169,48 +143,30 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-static inline void extent_set_compress_type(unsigned long *bio_flags,
- int compress_type)
-{
- *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
-}
-
-static inline int extent_compress_type(unsigned long bio_flags)
-{
- return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
-}
-
struct extent_map_tree;
-typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
-
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num);
-int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode);
+int btrfs_read_folio(struct file *file, struct folio *folio);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
-void set_page_extent_mapped(struct page *page);
+void extent_readahead(struct readahead_control *rac);
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+int set_page_extent_mapped(struct page *page);
+void clear_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u64 owner_root, int level);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
-struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
void free_extent_buffer(struct extent_buffer *eb);
@@ -221,14 +177,23 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level);
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
static inline int num_extent_pages(const struct extent_buffer *eb)
{
- return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) -
- (eb->start >> PAGE_SHIFT);
+ /*
+ * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
+ * sectorsize, it's just eb->len >> PAGE_SHIFT.
+ *
+ * For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE,
+ * thus have to ensure we get at least one page.
+ */
+ return (eb->len >> PAGE_SHIFT) ?: 1;
}
-static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
@@ -238,86 +203,82 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
-void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
-void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
+void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
+void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
-void write_extent_buffer(struct extent_buffer *eb, const void *src,
+void write_extent_buffer(const struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
-void copy_extent_buffer_full(struct extent_buffer *dst,
- struct extent_buffer *src);
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+void copy_extent_buffer_full(const struct extent_buffer *dst,
+ const struct extent_buffer *src);
+void copy_extent_buffer(const struct extent_buffer *dst,
+ const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len);
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- unsigned long src_offset, unsigned long len);
-void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+void memcpy_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memmove_extent_buffer(const struct extent_buffer *dst,
+ unsigned long dst_offset, unsigned long src_offset,
+ unsigned long len);
+void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
-int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long pos);
-void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
-void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- unsigned long pos, unsigned long len);
-void clear_extent_buffer_dirty(struct extent_buffer *eb);
+void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ unsigned long start, unsigned long pos,
+ unsigned long len);
+void clear_extent_buffer_dirty(const struct extent_buffer *eb);
bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-int extent_buffer_under_io(struct extent_buffer *eb);
-int map_private_extent_buffer(const struct extent_buffer *eb,
- unsigned long offset, unsigned long min_len,
- char **map, unsigned long *map_start,
- unsigned long *map_len);
+int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned bits_to_clear,
- unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct bio *bio);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
+ u32 bits_to_clear, unsigned long page_ops);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
-struct btrfs_fs_info;
-struct btrfs_inode;
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
-int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
+int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
+ * mirrors. If another mirror has good data, the sector is set up to date
* and things continue. If a good mirror can't be found, the original
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
struct page *page;
- u64 start;
u64 len;
u64 logical;
- unsigned long bio_flags;
int this_mirror;
int failed_mirror;
- int in_validation;
+ int num_copies;
};
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
+ submit_bio_hook_t *submit_bio_hook);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset);
-bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- struct io_failure_record *failrec, int fail_mirror);
-struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- struct io_failure_record *failrec,
- struct page *page, int pg_offset, int icsum,
- bio_end_io_t *endio_func, void *data);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
@@ -325,4 +286,11 @@ bool find_lock_delalloc_range(struct inode *inode,
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+
+#ifdef CONFIG_BTRFS_DEBUG
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
+#else
+#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
+#endif
+
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index bd6229fb2b6f..6092a4eedc92 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -7,6 +7,7 @@
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
+#include "btrfs_inode.h"
static struct kmem_cache *extent_map_cache;
@@ -54,9 +55,7 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
- em->generation = 0;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -73,7 +72,6 @@ void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(refcount_read(&em->refs) == 0);
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
@@ -143,8 +141,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
* it can't be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -152,6 +149,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct extent_map *entry;
struct extent_map *prev_entry = NULL;
+ ASSERT(prev_or_next_ret);
+
while (n) {
entry = rb_entry(n, struct extent_map, rb_node);
prev = n;
@@ -165,24 +164,29 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return n;
}
- if (prev_ret) {
- orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *prev_ret = prev;
- prev = orig_prev;
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
}
- if (next_ret) {
+ /*
+ * Previous extent map found, return as in this case the caller does not
+ * care about the next one.
+ */
+ if (prev) {
+ *prev_or_next_ret = prev;
+ return NULL;
+ }
+
+ prev = orig_prev;
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *next_ret = prev;
}
+ *prev_or_next_ret = prev;
+
return NULL;
}
@@ -261,6 +265,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -278,6 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
free_extent_map(merge);
}
}
@@ -334,6 +340,8 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
if (extent_map_in_tree(em))
try_merge_map(tree, em);
@@ -360,7 +368,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,19 +383,22 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
stripe->physical + stripe_size - 1, bits,
- 0, 0, NULL, GFP_NOWAIT, NULL);
+ NULL, GFP_NOWAIT, NULL);
}
}
/**
- * add_extent_mapping - add new extent map to the extent tree
+ * Add new extent map to the extent tree
+ *
* @tree: tree to insert new map in
* @em: map to insert
+ * @modified: indicate whether the given @em should be added to the
+ * modified list, which indicates the extent needs to be logged
*
* Insert @em into @tree or perform a simple forward/backward merge with
* existing mappings. The extent_map struct passed in will be inserted
@@ -420,16 +431,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
{
struct extent_map *em;
struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
+ struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
if (!rb_node) {
- if (prev)
- rb_node = prev;
- else if (next)
- rb_node = next;
+ if (prev_or_next)
+ rb_node = prev_or_next;
else
return NULL;
}
@@ -487,6 +495,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
@@ -501,6 +511,8 @@ void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *new,
int modified)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
@@ -574,12 +586,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
}
/**
- * btrfs_add_extent_mapping - add extent mapping into em_tree
- * @fs_info - used for tracepoint
- * @em_tree - the extent tree into which we want to insert the extent mapping
- * @em_in - extent we are inserting
- * @start - start of the logical range btrfs_get_extent() is requesting
- * @len - length of the logical range btrfs_get_extent() is requesting
+ * Add extent mapping into em_tree
+ *
+ * @fs_info: the filesystem
+ * @em_tree: extent tree into which we want to insert the extent mapping
+ * @em_in: extent we are inserting
+ * @start: start of the logical range btrfs_get_extent() is requesting
+ * @len: length of the logical range btrfs_get_extent() is requesting
*
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
@@ -648,3 +661,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
ASSERT(ret == 0 || ret == -EEXIST);
return ret;
}
+
+/*
+ * Drop all extent maps from a tree in the fastest possible way, rescheduling
+ * if needed. This avoids searching the tree, from the root down to the first
+ * extent map, before each deletion.
+ */
+static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+{
+ write_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ struct extent_map *em;
+ struct rb_node *node;
+
+ node = rb_first_cached(&tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(tree, em);
+ free_extent_map(em);
+ cond_resched_rwlock_write(&tree->lock);
+ }
+ write_unlock(&tree->lock);
+}
+
+/*
+ * Drop all extent maps in a given range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (inclusive value).
+ * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
+ *
+ * This drops all the extent maps that intersect the given range [@start, @end].
+ * Extent maps that partially overlap the range and extend behind or beyond it,
+ * are split.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
+ bool skip_pinned)
+{
+ struct extent_map *split;
+ struct extent_map *split2;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 len = end - start + 1;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ if (start == 0 && !skip_pinned) {
+ drop_all_extent_maps_fast(em_tree);
+ return;
+ }
+ len = (u64)-1;
+ } else {
+ /* Make end offset exclusive for use in the loop below. */
+ end++;
+ }
+
+ /*
+ * It's ok if we fail to allocate the extent maps, see the comment near
+ * the bottom of the loop below. We only need two spare extent maps in
+ * the worst case, where the first extent map that intersects our range
+ * starts before the range and the last extent map that intersects our
+ * range ends after our range (and they might be the same extent map),
+ * because we need to split those two extent maps at the boundaries.
+ */
+ split = alloc_extent_map();
+ split2 = alloc_extent_map();
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+
+ while (em) {
+ /* extent_map_end() returns exclusive value (last byte + 1). */
+ const u64 em_end = extent_map_end(em);
+ struct extent_map *next_em = NULL;
+ u64 gen;
+ unsigned long flags;
+ bool modified;
+ bool compressed;
+
+ if (em_end < end) {
+ next_em = next_extent_map(em);
+ if (next_em) {
+ if (next_em->start < end)
+ refcount_inc(&next_em->refs);
+ else
+ next_em = NULL;
+ }
+ }
+
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ start = em_end;
+ if (end != (u64)-1)
+ len = start + len - em_end;
+ goto next;
+ }
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+
+ /*
+ * The extent map does not cross our target range, so no need to
+ * split it, we can remove it directly.
+ */
+ if (em->start >= start && em_end <= end)
+ goto remove_em;
+
+ flags = em->flags;
+ gen = em->generation;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ if (em->start < start) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = em->start;
+ split->len = start - em->start;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
+ split->generation = gen;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ replace_extent_mapping(em_tree, em, split, modified);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em_end > end) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = start + len;
+ split->len = em_end - (start + len);
+ split->block_start = em->block_start;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->orig_start = em->orig_start;
+ } else {
+ const u64 diff = start + len - em->start;
+
+ split->block_len = split->len;
+ split->block_start += diff;
+ split->orig_start = em->orig_start;
+ }
+ } else {
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->orig_block_len = 0;
+ }
+
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ int ret;
+
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ /* Logic error, shouldn't happen. */
+ ASSERT(ret == 0);
+ if (WARN_ON(ret != 0) && modified)
+ btrfs_set_inode_full_sync(inode);
+ }
+ free_extent_map(split);
+ split = NULL;
+ }
+remove_em:
+ if (extent_map_in_tree(em)) {
+ /*
+ * If the extent map is still in the tree it means that
+ * either of the following is true:
+ *
+ * 1) It fits entirely in our range (doesn't end beyond
+ * it or starts before it);
+ *
+ * 2) It starts before our range and/or ends after our
+ * range, and we were not able to allocate the extent
+ * maps for split operations, @split and @split2.
+ *
+ * If we are at case 2) then we just remove the entire
+ * extent map - this is fine since if anyone needs it to
+ * access the subranges outside our range, will just
+ * load it again from the subvolume tree's file extent
+ * item. However if the extent map was in the list of
+ * modified extents, then we must mark the inode for a
+ * full fsync, otherwise a fast fsync will miss this
+ * extent if it's new and needs to be logged.
+ */
+ if ((em->start < start || em_end > end) && modified) {
+ ASSERT(!split);
+ btrfs_set_inode_full_sync(inode);
+ }
+ remove_extent_mapping(em_tree, em);
+ }
+
+ /*
+ * Once for the tree reference (we replaced or removed the
+ * extent map from the tree).
+ */
+ free_extent_map(em);
+next:
+ /* Once for us (for our lookup reference). */
+ free_extent_map(em);
+
+ em = next_em;
+ }
+
+ write_unlock(&em_tree->lock);
+
+ free_extent_map(split);
+ free_extent_map(split2);
+}
+
+/*
+ * Replace a range in the inode's extent map tree with a new extent map.
+ *
+ * @inode: The target inode.
+ * @new_em: The new extent map to add to the inode's extent map tree.
+ * @modified: Indicate if the new extent map should be added to the list of
+ * modified extents (for fast fsync tracking).
+ *
+ * Drops all the extent maps in the inode's extent map tree that intersect the
+ * range of the new extent map and adds the new extent map to the tree.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified)
+{
+ const u64 end = new_em->start + new_em->len - 1;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ int ret;
+
+ ASSERT(!extent_map_in_tree(new_em));
+
+ /*
+ * The caller has locked an appropriate file range in the inode's io
+ * tree, but getting -EEXIST when adding the new extent map can still
+ * happen in case there are extents that partially cover the range, and
+ * this is due to two tasks operating on different parts of the extent.
+ * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
+ * btrfs_get_extent") for an example and details.
+ */
+ do {
+ btrfs_drop_extent_map_range(inode, new_em->start, end, false);
+ write_lock(&tree->lock);
+ ret = add_extent_mapping(tree, new_em, modified);
+ write_unlock(&tree->lock);
+ } while (ret == -EEXIST);
+
+ return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e217337dff9..ad311864272a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -25,6 +25,8 @@ enum {
EXTENT_FLAG_FILLING,
/* filesystem extent mapping type */
EXTENT_FLAG_FS_MAPPING,
+ /* This em is merged from two or more physically adjacent ems */
+ EXTENT_FLAG_MERGED,
};
struct extent_map {
@@ -40,6 +42,12 @@ struct extent_map {
u64 ram_bytes;
u64 block_start;
u64 block_len;
+
+ /*
+ * Generation of the extent map, for merged em it's the highest
+ * generation of all merged ems.
+ * For non-merged extents, it's from btrfs_file_extent_item::generation.
+ */
u64 generation;
unsigned long flags;
/* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
@@ -55,6 +63,8 @@ struct extent_map_tree {
rwlock_t lock;
};
+struct btrfs_inode;
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
@@ -96,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ u64 start, u64 end,
+ bool skip_pinned);
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c2f365662d55..6bb9fa961a6a 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -9,6 +9,7 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -23,6 +24,103 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
+/**
+ * Set inode's size according to filesystem options
+ *
+ * @inode: inode we want to update the disk_i_size for
+ * @new_i_size: i_size we want to set to, 0 if we use i_size
+ *
+ * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read()
+ * returns as it is perfectly fine with a file that has holes without hole file
+ * extent items.
+ *
+ * However without NO_HOLES we need to only return the area that is contiguous
+ * from the 0 offset of the file. Otherwise we could end up adjust i_size up
+ * to an extent that has a gap in between.
+ *
+ * Finally new_i_size should only be set in the case of truncate where we're not
+ * ready to use i_size_read() as the limiter yet.
+ */
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 start, end, i_size;
+ int ret;
+
+ i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
+ if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ inode->disk_i_size = i_size;
+ return;
+ }
+
+ spin_lock(&inode->lock);
+ ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
+ if (!ret && start == 0)
+ i_size = min(i_size, end + 1);
+ else
+ i_size = 0;
+ inode->disk_i_size = i_size;
+ spin_unlock(&inode->lock);
+}
+
+/**
+ * Mark range within a file as having a new extent inserted
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've inserted
+ * @len: logical length of the file extent item
+ *
+ * Call when we are inserting a new file extent where there was none before.
+ * Does not need to call this in the case where we're replacing an existing file
+ * extent, however if not sure it's fine to call this multiple times.
+ *
+ * The start and len must match the file extent item, so thus must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY);
+}
+
+/**
+ * Marks an inode range as not having a backing extent
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've inserted
+ * @len: logical length of the file extent item
+ *
+ * Called when we drop a file extent, for example when we truncate. Doesn't
+ * need to be called for cases where we're replacing a file extent, like when
+ * we've COWed a file extent.
+ *
+ * The start and len must match the file extent item, so thus must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
+ len == (u64)-1);
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return clear_extent_bit(&inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, NULL);
+}
+
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
u16 csum_size)
{
@@ -31,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
return ncsums * fs_info->sectorsize;
}
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+/*
+ * Calculate the total size needed to allocate for an ordered sum structure
+ * spanning @bytes in the file.
+ */
+static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+{
+ int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
+
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
+}
+
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding)
+ u64 objectid, u64 pos, u64 num_bytes)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -51,7 +157,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
@@ -60,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
- btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, 0);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_compression(leaf, item, compression);
- btrfs_set_file_extent_encryption(leaf, item, encryption);
- btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+ btrfs_set_file_extent_compression(leaf, item, 0);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
btrfs_mark_buffer_dirty(leaf);
out:
@@ -90,7 +195,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -110,8 +215,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
- csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+ fs_info->sectorsize_bits;
+ csums_in_item = btrfs_item_size(leaf, path->slots[0]);
csums_in_item /= csum_size;
if (csum_offset == csums_in_item) {
@@ -136,7 +241,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 objectid,
u64 offset, int mod)
{
- int ret;
struct btrfs_key file_key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
@@ -144,64 +248,189 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
- ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+
+ return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+}
+
+/*
+ * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
+ * estore the result to @dst.
+ *
+ * Return >0 for the number of sectors we found.
+ * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
+ * for it. Caller may want to try next sector until one range is hit.
+ * Return <0 for fatal error.
+ */
+static int search_csum_tree(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 disk_bytenr,
+ u64 len, u8 *dst)
+{
+ struct btrfs_root *csum_root;
+ struct btrfs_csum_item *item = NULL;
+ struct btrfs_key key;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 itemsize;
+ int ret;
+ u64 csum_start;
+ u64 csum_len;
+
+ ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) &&
+ IS_ALIGNED(len, sectorsize));
+
+ /* Check if the current csum item covers disk_bytenr */
+ if (path->nodes[0]) {
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+
+ if (in_range(disk_bytenr, csum_start, csum_len))
+ goto found;
+ }
+
+ /* Current item doesn't contain the desired range, search again */
+ btrfs_release_path(path);
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ ret = PTR_ERR(item);
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+ ASSERT(in_range(disk_bytenr, csum_start, csum_len));
+
+found:
+ ret = (min(csum_start + csum_len, disk_bytenr + len) -
+ disk_bytenr) >> fs_info->sectorsize_bits;
+ read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
+ ret * csum_size);
+out:
+ if (ret == -ENOENT || ret == -EFBIG)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Locate the file_offset of @cur_disk_bytenr of a @bio.
+ *
+ * Bio of btrfs represents read range of
+ * [bi_sector << 9, bi_sector << 9 + bi_size).
+ * Knowing this, we can iterate through each bvec to locate the page belong to
+ * @cur_disk_bytenr and get the file offset.
+ *
+ * @inode is used to determine if the bvec page really belongs to @inode.
+ *
+ * Return 0 if we can't find the file offset
+ * Return >0 if we find the file offset and restore it to @file_offset_ret
+ */
+static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
+ u64 disk_bytenr, u64 *file_offset_ret)
+{
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ int ret = 0;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ struct page *page = bvec.bv_page;
+
+ if (cur > disk_bytenr)
+ break;
+ if (cur + bvec.bv_len <= disk_bytenr) {
+ cur += bvec.bv_len;
+ continue;
+ }
+ ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
+ if (page->mapping && page->mapping->host &&
+ page->mapping->host == inode) {
+ ret = 1;
+ *file_offset_ret = page_offset(page) + bvec.bv_offset +
+ disk_bytenr - cur;
+ break;
+ }
+ }
return ret;
}
/**
- * btrfs_lookup_bio_sums - Look up checksums for a bio.
+ * Lookup the checksum for the read bio in csum tree.
+ *
* @inode: inode that the bio is for.
- * @bio: bio embedded in btrfs_io_bio.
- * @offset: Unless (u64)-1, look up checksums for this offset in the file.
- * If (u64)-1, use the page offsets from the bio instead.
- * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If
- * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead.
+ * @bio: bio to look up.
+ * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
+ * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
+ * NULL, the checksum buffer is allocated and returned in
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 offset, u8 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio_vec bvec;
- struct bvec_iter iter;
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
- struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_bio *bbio = NULL;
struct btrfs_path *path;
- const bool page_offsets = (offset == (u64)-1);
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 orig_len = bio->bi_iter.bi_size;
+ u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_bytenr;
u8 *csum;
- u64 item_start_offset = 0;
- u64 item_last_offset = 0;
- u64 disk_bytenr;
- u64 page_bytes_left;
- u32 diff;
- int nblocks;
+ const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ blk_status_t ret = BLK_STS_OK;
+
+ if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
+ return BLK_STS_OK;
+ /*
+ * This function is only called for read bio.
+ *
+ * This means two things:
+ * - All our csums should only be in csum tree
+ * No ordered extents csums, as ordered extents are only for write
+ * path.
+ * - No need to bother any other info from bvec
+ * Since we're looking up csums, the only important info is the
+ * disk_bytenr and the length, which can be extracted from bi_iter
+ * directly.
+ */
+ ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
return BLK_STS_RESOURCE;
- nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
+ bbio = btrfs_bio(bio);
+
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
- GFP_NOFS);
- if (!btrfs_bio->csum) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
- btrfs_bio->csum = btrfs_bio->csum_inline;
+ bbio->csum = bbio->csum_inline;
}
- csum = btrfs_bio->csum;
+ csum = bbio->csum;
} else {
csum = dst;
}
- if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
+ /*
+ * If requested number of sectors is larger than one leaf can contain,
+ * kick the readahead for csum tree.
+ */
+ if (nblocks > fs_info->csums_per_leaf)
path->reada = READA_FORWARD;
/*
@@ -215,91 +444,75 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
path->skip_locking = 1;
}
- disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
-
- bio_for_each_segment(bvec, bio, iter) {
- page_bytes_left = bvec.bv_len;
- if (count)
- goto next;
-
- if (page_offsets)
- offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
- if (count)
- goto found;
+ for (cur_disk_bytenr = orig_disk_bytenr;
+ cur_disk_bytenr < orig_disk_bytenr + orig_len;
+ cur_disk_bytenr += (count * sectorsize)) {
+ u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
+ unsigned int sector_offset;
+ u8 *csum_dst;
- if (!item || disk_bytenr < item_start_offset ||
- disk_bytenr >= item_last_offset) {
- struct btrfs_key found_key;
- u32 item_size;
-
- if (item)
- btrfs_release_path(path);
- item = btrfs_lookup_csum(NULL, fs_info->csum_root,
- path, disk_bytenr, 0);
- if (IS_ERR(item)) {
- count = 1;
- memset(csum, 0, csum_size);
- if (BTRFS_I(inode)->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
- set_extent_bits(io_tree, offset,
- offset + fs_info->sectorsize - 1,
- EXTENT_NODATASUM);
- } else {
- btrfs_info_rl(fs_info,
- "no csum found for inode %llu start %llu",
- btrfs_ino(BTRFS_I(inode)), offset);
- }
- item = NULL;
- btrfs_release_path(path);
- goto found;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
-
- item_start_offset = found_key.offset;
- item_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
- item_last_offset = item_start_offset +
- (item_size / csum_size) *
- fs_info->sectorsize;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_csum_item);
+ /*
+ * Although both cur_disk_bytenr and orig_disk_bytenr is u64,
+ * we're calculating the offset to the bio start.
+ *
+ * Bio size is limited to UINT_MAX, thus unsigned int is large
+ * enough to contain the raw result, not to mention the right
+ * shifted result.
+ */
+ ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
+ sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
+ fs_info->sectorsize_bits;
+ csum_dst = csum + sector_offset * csum_size;
+
+ count = search_csum_tree(fs_info, path, cur_disk_bytenr,
+ search_len, csum_dst);
+ if (count < 0) {
+ ret = errno_to_blk_status(count);
+ if (bbio)
+ btrfs_bio_free_csum(bbio);
+ break;
}
+
/*
- * this byte range must be able to fit inside
- * a single leaf so it will also fit inside a u32
+ * We didn't find a csum for this range. We need to make sure
+ * we complain loudly about this, because we are not NODATASUM.
+ *
+ * However for the DATA_RELOC inode we could potentially be
+ * relocating data extents for a NODATASUM inode, so the inode
+ * itself won't be marked with NODATASUM, but the extent we're
+ * copying is in fact NODATASUM. If we don't find a csum we
+ * assume this is the case.
*/
- diff = disk_bytenr - item_start_offset;
- diff = diff / fs_info->sectorsize;
- diff = diff * csum_size;
- count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
- inode->i_sb->s_blocksize_bits);
- read_extent_buffer(path->nodes[0], csum,
- ((unsigned long)item) + diff,
- csum_size * count);
-found:
- csum += count * csum_size;
- nblocks -= count;
-next:
- while (count > 0) {
- count--;
- disk_bytenr += fs_info->sectorsize;
- offset += fs_info->sectorsize;
- page_bytes_left -= fs_info->sectorsize;
- if (!page_bytes_left)
- break; /* move to next bio */
+ if (count == 0) {
+ memset(csum_dst, 0, csum_size);
+ count = 1;
+
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ u64 file_offset;
+ int ret;
+
+ ret = search_file_offset_in_bio(bio, inode,
+ cur_disk_bytenr, &file_offset);
+ if (ret)
+ set_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ } else {
+ btrfs_warn_rl(fs_info,
+ "csum hole found for disk bytenr range [%llu, %llu)",
+ cur_disk_bytenr, cur_disk_bytenr + sectorsize);
+ }
}
}
- WARN_ON_ONCE(count);
btrfs_free_path(path);
- return BLK_STS_OK;
+ return ret;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit)
+ struct list_head *list, int search_commit,
+ bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -312,7 +525,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -321,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
if (search_commit) {
path->skip_locking = 1;
path->reada = READA_FORWARD;
@@ -339,10 +553,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
- offset = (start - key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
- btrfs_item_size_nr(leaf, path->slots[0] - 1))
+ btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
@@ -367,7 +580,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.offset > start)
start = key.offset;
- size = btrfs_item_size_nr(leaf, path->slots[0]);
+ size = btrfs_item_size(leaf, path->slots[0]);
csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
if (csum_end <= start) {
path->slots[0]++;
@@ -390,10 +603,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->bytenr = start;
sums->len = (int)size;
- offset = (start - key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
offset *= csum_size;
- size >>= fs_info->sb->s_blocksize_bits;
+ size >>= fs_info->sectorsize_bits;
read_extent_buffer(path->nodes[0],
sums->sums,
@@ -418,34 +630,34 @@ fail:
return ret;
}
-/*
- * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
+/**
+ * Calculate checksums of the data contained inside a bio
+ *
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
- * @file_start: offset in file this bio begins to describe
- * @contig: Boolean. If true/1 means all bio vecs in this bio are
- * contiguous and they begin at @file_start in the file. False/0
- * means this bio can contains potentially discontigous bio vecs
- * so the logical offset of each should be calculated separately.
+ * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the
+ * file offsets are determined from the page offsets in the bio.
+ * Otherwise, this is the starting file offset of the bio vecs in
+ * @bio, which must be contiguous.
+ * @one_ordered: If true, @bio only refers to one ordered extent.
*/
-blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- u64 file_start, int contig)
+blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
+ u64 offset, bool one_ordered)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
+ const bool use_page_offsets = (offset == (u64)-1);
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
int index;
- int nr_sectors;
+ unsigned int blockcount;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
int i;
- u64 offset;
unsigned nofs_flag;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
@@ -458,32 +670,39 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
- if (contig)
- offset = file_start;
- else
- offset = 0; /* shut up gcc */
-
- sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
- if (!contig)
+ if (use_page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
if (!ordered) {
ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered); /* Logic error */
+ /*
+ * The bio range is not covered by any ordered extent,
+ * must be a code logic error.
+ */
+ if (unlikely(!ordered)) {
+ WARN(1, KERN_WARNING
+ "no ordered extent for root %llu ino %llu offset %llu\n",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), offset);
+ kvfree(sums);
+ return BLK_STS_IOERR;
+ }
}
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
+ blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
- for (i = 0; i < nr_sectors; i++) {
- if (offset >= ordered->file_offset + ordered->num_bytes ||
- offset < ordered->file_offset) {
+ for (i = 0; i < blockcount; i++) {
+ if (!one_ordered &&
+ !in_range(offset, ordered->file_offset,
+ ordered->num_bytes)) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
@@ -502,19 +721,18 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
ordered = btrfs_lookup_ordered_extent(inode,
offset);
ASSERT(ordered); /* Logic error */
- sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
+ sums->bytenr = (bio->bi_iter.bi_sector << 9)
+ total_bytes;
index = 0;
}
- crypto_shash_init(shash);
- data = kmap_atomic(bvec.bv_page);
- crypto_shash_update(shash, data + bvec.bv_offset
- + (i * fs_info->sectorsize),
- fs_info->sectorsize);
- kunmap_atomic(data);
- crypto_shash_final(shash, (char *)(sums->sums + index));
- index += csum_size;
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
+ fs_info->sectorsize,
+ sums->sums + index);
+ kunmap_local(data);
+ index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
total_bytes += fs_info->sectorsize;
@@ -544,14 +762,14 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
u64 csum_end;
u64 end_byte = bytenr + len;
- u32 blocksize_bits = fs_info->sb->s_blocksize_bits;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
leaf = path->nodes[0];
- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
- csum_end <<= fs_info->sb->s_blocksize_bits;
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
+ csum_end <<= blocksize_bits;
csum_end += key->offset;
if (key->offset < bytenr && csum_end <= end_byte) {
@@ -597,11 +815,11 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int blocksize_bits = fs_info->sb->s_blocksize_bits;
+ int ret = 0;
+ const u32 csum_size = fs_info->csum_size;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
- ASSERT(root == fs_info->csum_root ||
+ ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID ||
root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
@@ -613,9 +831,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -634,7 +852,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
if (key.offset >= end_byte)
break;
- csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+ csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key.offset;
@@ -672,7 +890,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -716,8 +934,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -727,12 +946,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -748,12 +996,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
- u32 nritems;
u32 ins_size;
int index = 0;
int found_next;
int ret;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
path = btrfs_alloc_path();
if (!path)
@@ -773,55 +1020,56 @@ again:
item_end = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((char *)item_end +
- btrfs_item_size_nr(leaf, path->slots[0]));
+ btrfs_item_size(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
if (ret != -EFBIG && ret != -ENOENT)
- goto fail_unlock;
+ goto out;
if (ret == -EFBIG) {
u32 item_size;
/* we found one, but it isn't big enough yet */
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if ((item_size / csum_size) >=
MAX_CSUM_ITEMS(fs_info, csum_size)) {
/* already at max size, make a new one */
goto insert;
}
} else {
- int slot = path->slots[0] + 1;
- /* we didn't find a csum item, insert one */
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (!nritems || (path->slots[0] >= nritems - 1)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 1)
- found_next = 1;
- if (ret != 0)
- goto insert;
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
- found_next = 1;
- goto insert;
- }
- next_offset = found_key.offset;
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
found_next = 1;
goto insert;
}
/*
- * at this point, we know the tree has an item, but it isn't big
- * enough yet to put our csum in. Grow it
+ * At this point, we know the tree has a checksum item that ends at an
+ * offset matching the start of the checksum range we want to insert.
+ * We try to extend that item as much as possible and then add as many
+ * checksums to it as they fit.
+ *
+ * First check if the leaf has enough free space for at least one
+ * checksum. If it has go directly to the item extension code, otherwise
+ * release the path and do a search for insertion before the extension.
*/
+ if (btrfs_leaf_free_space(leaf) >= csum_size) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ csum_offset = (bytenr - found_key.offset) >>
+ fs_info->sectorsize_bits;
+ goto extend_csum;
+ }
+
btrfs_release_path(path);
+ path->search_for_extension = 1;
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
+ path->search_for_extension = 0;
if (ret < 0)
- goto fail_unlock;
+ goto out;
if (ret > 0) {
if (path->slots[0] == 0)
@@ -831,8 +1079,7 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits;
if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
@@ -840,30 +1087,64 @@ again:
goto insert;
}
- if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
+extend_csum:
+ if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
csum_size) {
int extend_nr;
u64 tmp;
u32 diff;
- u32 free_space;
-
- if (btrfs_leaf_free_space(leaf) <
- sizeof(struct btrfs_item) + csum_size * 2)
- goto insert;
- free_space = btrfs_leaf_free_space(leaf) -
- sizeof(struct btrfs_item) - csum_size;
tmp = sums->len - total_bytes;
- tmp >>= fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeat the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
- extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
- diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
- diff = min(free_space, diff);
+ diff = diff - btrfs_item_size(leaf, path->slots[0]);
+ diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
diff /= csum_size;
diff *= csum_size;
@@ -879,9 +1160,9 @@ insert:
u64 tmp;
tmp = sums->len - total_bytes;
- tmp >>= fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sectorsize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
- fs_info->sb->s_blocksize_bits);
+ fs_info->sectorsize_bits);
tmp = max_t(u64, 1, tmp);
tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size));
@@ -889,24 +1170,21 @@ insert:
} else {
ins_size = csum_size;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
- path->leave_spinning = 0;
if (ret < 0)
- goto fail_unlock;
+ goto out;
if (WARN_ON(ret != 0))
- goto fail_unlock;
+ goto out;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((unsigned char *)item +
- btrfs_item_size_nr(leaf, path->slots[0]));
+ btrfs_item_size(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- ins_size = (u32)(sums->len - total_bytes) >>
- fs_info->sb->s_blocksize_bits;
+ ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits;
ins_size *= csum_size;
ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
ins_size);
@@ -926,9 +1204,6 @@ found:
out:
btrfs_free_path(path);
return ret;
-
-fail_unlock:
- goto out;
}
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -949,19 +1224,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
-
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- extent_end = extent_start +
- btrfs_file_extent_num_bytes(leaf, fi);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- size_t size;
- size = btrfs_file_extent_ram_bytes(leaf, fi);
- extent_end = ALIGN(extent_start + size,
- fs_info->sectorsize);
- }
-
+ extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
@@ -1007,3 +1272,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
root->root_key.objectid);
}
}
+
+/*
+ * Returns the end offset (non inclusive) of the file extent item the given path
+ * points to. If it points to an inline extent, the returned offset is rounded
+ * up to the sector size.
+ */
+u64 btrfs_file_extent_end(const struct btrfs_path *path)
+{
+ const struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ end = btrfs_file_extent_ram_bytes(leaf, fi);
+ end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
+ } else {
+ end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ }
+
+ return end;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a16da274c9aa..d01631d47806 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,6 +16,7 @@
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
+#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -27,6 +28,8 @@
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "reflink.h"
+#include "subpage.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -47,11 +50,14 @@ struct inode_defrag {
/* root objectid */
u64 root;
- /* last offset we were able to defrag */
- u64 last_offset;
-
- /* if we've wrapped around back to zero once already */
- int cycled;
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents,
+ * thus needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
@@ -104,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
- if (defrag->last_offset > entry->last_offset)
- entry->last_offset = defrag->last_offset;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
return -EEXIST;
}
}
@@ -131,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -157,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
@@ -176,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
}
/*
- * Requeue the defrag object. If there is a defrag object that points to
- * the same inode in the tree, we will merge them together (by
- * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
- */
-static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- goto out;
-
- /*
- * Here we don't check the IN_DEFRAG flag, because we need merge
- * them together.
- */
- spin_lock(&fs_info->defrag_inodes_lock);
- ret = __btrfs_add_inode_defrag(inode, defrag);
- spin_unlock(&fs_info->defrag_inodes_lock);
- if (ret)
- goto out;
- return;
-out:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-}
-
-/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
@@ -274,70 +253,55 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *inode_root;
struct inode *inode;
- struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
- int num_defrag;
- int index;
- int ret;
-
- /* get the inode */
- key.objectid = defrag->root;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
+ int ret = 0;
+ u64 cur = 0;
- index = srcu_read_lock(&fs_info->subvol_srcu);
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ /* get the inode */
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
}
- key.objectid = defrag->ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, inode_root);
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
- range.start = defrag->last_offset;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
sb_start_write(fs_info->sb);
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == BTRFS_DEFRAG_BATCH) {
- defrag->last_offset = range.start;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
-
iput(inode);
- return 0;
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
cleanup:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
@@ -409,7 +373,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* Copy data from userspace to the current page
*/
- copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+ copied = copy_page_from_iter_atomic(page, offset, count, i);
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -423,20 +387,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
* The rest of the btrfs_file_write code will fall
* back to page at a time copies after we return 0.
*/
- if (!PageUptodate(page) && copied < count)
- copied = 0;
+ if (unlikely(copied < count)) {
+ if (!PageUptodate(page)) {
+ iov_iter_revert(i, copied);
+ copied = 0;
+ }
+ if (!copied)
+ break;
+ }
- iov_iter_advance(i, copied);
write_bytes -= copied;
total_copied += copied;
-
- /* Return to btrfs_file_write_iter to fault page */
- if (unlikely(copied == 0))
- break;
-
- if (copied < PAGE_SIZE - offset) {
- offset += copied;
- } else {
+ offset += copied;
+ if (offset == PAGE_SIZE) {
pg++;
offset = 0;
}
@@ -447,9 +410,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
{
size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
@@ -457,77 +426,44 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- ClearPageChecked(pages[i]);
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
}
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
- const u64 start,
- const u64 len,
- struct extent_state **cached_state)
-{
- u64 search_start = start;
- const u64 end = start + len - 1;
-
- while (search_start < end) {
- const u64 search_len = end - search_start + 1;
- struct extent_map *em;
- u64 em_len;
- int ret = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- if (em->block_start != EXTENT_MAP_HOLE)
- goto next;
-
- em_len = em->len;
- if (em->start < search_start)
- em_len -= search_start - em->start;
- if (em_len > search_len)
- em_len = search_len;
-
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
-next:
- search_start = extent_map_end(em);
- free_extent_map(em);
- if (ret)
- return ret;
- }
- return 0;
-}
-
/*
- * after copy_from_user, pages need to be dirtied and we need to make
- * sure holes are created between the current EOF and the start of
- * any next extents (if required).
- *
- * this also makes the decision about creating an inline extent vs
- * doing real data extents, marking pages dirty and delalloc as required.
+ * After btrfs_copy_from_user(), update the following things for delalloc:
+ * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * Used to advise which range is to be written back.
+ * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Update inode size for past EOF write
*/
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached)
+ struct extent_state **cached, bool noreserve)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
- start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+ if (write_bytes == 0)
+ return 0;
+
+ if (noreserve)
+ extra_bits |= EXTENT_NORESERVE;
+
+ start_pos = round_down(pos, fs_info->sectorsize);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
+ ASSERT(num_bytes <= U32_MAX);
end_of_last_block = start_pos + num_bytes - 1;
@@ -535,27 +471,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached);
-
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
- if (start_pos >= isize &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
- /*
- * There can't be any extents following eof in this case
- * so just set the delalloc new bit for the range
- * directly.
- */
- extra_bits |= EXTENT_DELALLOC_NEW;
- } else {
- err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
- start_pos,
- num_bytes, cached);
- if (err)
- return err;
- }
- }
+ cached);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -564,9 +482,10 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- SetPageUptodate(p);
- ClearPageChecked(p);
- set_page_dirty(p);
+
+ btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
+ btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
/*
@@ -575,164 +494,11 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
* at this time.
*/
if (end_pos > isize)
- i_size_write(inode, end_pos);
+ i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
/*
- * this drops all the extents in the cache that intersect the range
- * [start, end]. Existing extents are split as required.
- */
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned)
-{
- struct extent_map *em;
- struct extent_map *split = NULL;
- struct extent_map *split2 = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- u64 len = end - start + 1;
- u64 gen;
- int ret;
- int testend = 1;
- unsigned long flags;
- int compressed = 0;
- bool modified;
-
- WARN_ON(end < start);
- if (end == (u64)-1) {
- len = (u64)-1;
- testend = 0;
- }
- while (1) {
- int no_splits = 0;
-
- modified = false;
- if (!split)
- split = alloc_extent_map();
- if (!split2)
- split2 = alloc_extent_map();
- if (!split || !split2)
- no_splits = 1;
-
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
- write_unlock(&em_tree->lock);
- break;
- }
- flags = em->flags;
- gen = em->generation;
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- if (testend && em->start + em->len >= start + len) {
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- break;
- }
- start = em->start + em->len;
- if (testend)
- len = start + len - (em->start + em->len);
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- continue;
- }
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
- if (no_splits)
- goto next;
-
- if (em->start < start) {
- split->start = em->start;
- split->len = start - em->start;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
- split->ram_bytes = em->ram_bytes;
- } else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- split->ram_bytes = split->len;
- }
-
- split->generation = gen;
- split->flags = flags;
- split->compress_type = em->compress_type;
- replace_extent_mapping(em_tree, em, split, modified);
- free_extent_map(split);
- split = split2;
- split2 = NULL;
- }
- if (testend && em->start + em->len > start + len) {
- u64 diff = start + len - em->start;
-
- split->start = start + len;
- split->len = em->start + em->len - (start + len);
- split->flags = flags;
- split->compress_type = em->compress_type;
- split->generation = gen;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
- split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->block_start = em->block_start;
- split->orig_start = em->orig_start;
- } else {
- split->block_len = split->len;
- split->block_start = em->block_start
- + diff;
- split->orig_start = em->orig_start;
- }
- } else {
- split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- }
-
- if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
- } else {
- ret = add_extent_mapping(em_tree, split,
- modified);
- ASSERT(ret == 0); /* Logic error */
- }
- free_extent_map(split);
- split = NULL;
- }
-next:
- if (extent_map_in_tree(em))
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us */
- free_extent_map(em);
- /* once for the tree*/
- free_extent_map(em);
- }
- if (split)
- free_extent_map(split);
- if (split2)
- free_extent_map(split2);
-}
-
-/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file.
@@ -740,14 +506,16 @@ next:
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
+ *
+ * Note: the VFS' inode number of bytes is not updated, it's up to the caller
+ * to deal with that. We set the field 'bytes_found' of the arguments structure
+ * with the number of allocated bytes found in the target range, so that the
+ * caller can update the inode's number of bytes in an atomic way when
+ * replacing extents in a range to avoid races with stat(2).
*/
-int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache,
- int replace_extent,
- u32 extent_item_size,
- int *key_inserted)
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
@@ -755,13 +523,13 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- u64 ino = btrfs_ino(BTRFS_I(inode));
- u64 search_start = start;
+ u64 ino = btrfs_ino(inode);
+ u64 search_start = args->start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
u64 extent_offset = 0;
u64 extent_end = 0;
- u64 last_end = start;
+ u64 last_end = args->start;
int del_nr = 0;
int del_slot = 0;
int extent_type;
@@ -770,23 +538,36 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int modify_tree = -1;
int update_refs;
int found = 0;
- int leafs_visited = 0;
+ struct btrfs_path *path = args->path;
- if (drop_cache)
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
+ args->bytes_found = 0;
+ args->extent_inserted = false;
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ /* Must always have a path if ->replace_extent is true */
+ ASSERT(!(args->replace_extent && !args->path));
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (args->drop_cache)
+ btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
+
+ if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
- update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root);
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
search_start, modify_tree);
if (ret < 0)
break;
- if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+ if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == ino &&
@@ -794,7 +575,6 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
path->slots[0]--;
}
ret = 0;
- leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -806,7 +586,6 @@ next_slot:
ret = 0;
break;
}
- leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
@@ -821,7 +600,7 @@ next_slot:
path->slots[0]++;
goto next_slot;
}
- if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -863,7 +642,7 @@ next_slot:
}
found = 1;
- search_start = max(key.offset, start);
+ search_start = max(key.offset, args->start);
if (recow || !modify_tree) {
modify_tree = -1;
btrfs_release_path(path);
@@ -874,7 +653,7 @@ next_slot:
* | - range to drop - |
* | -------- extent -------- |
*/
- if (start > key.offset && end < extent_end) {
+ if (args->start > key.offset && args->end < extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
@@ -882,7 +661,7 @@ next_slot:
}
memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = start;
+ new_key.offset = args->start;
ret = btrfs_duplicate_item(trans, root, path,
&new_key);
if (ret == -EAGAIN) {
@@ -896,15 +675,15 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
- start - key.offset);
+ args->start - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- extent_offset += start - key.offset;
+ extent_offset += args->start - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - start);
+ extent_end - args->start);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
@@ -914,11 +693,12 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
- key.offset = start;
+ key.offset = args->start;
}
/*
* From here on out we will have actually dropped something, so
@@ -930,23 +710,23 @@ next_slot:
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
- if (start <= key.offset && end < extent_end) {
+ if (args->start <= key.offset && args->end < extent_end) {
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
}
memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = end;
+ new_key.offset = args->end;
btrfs_set_item_key_safe(fs_info, path, &new_key);
- extent_offset += end - key.offset;
+ extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - end);
+ extent_end - args->end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, end - key.offset);
+ args->bytes_found += args->end - key.offset;
break;
}
@@ -955,7 +735,7 @@ next_slot:
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
- if (start > key.offset && end >= extent_end) {
+ if (args->start > key.offset && args->end >= extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
@@ -963,11 +743,11 @@ next_slot:
}
btrfs_set_file_extent_num_bytes(leaf, fi,
- start - key.offset);
+ args->start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, extent_end - start);
- if (end == extent_end)
+ args->bytes_found += extent_end - args->start;
+ if (args->end == extent_end)
break;
path->slots[0]++;
@@ -978,7 +758,7 @@ next_slot:
* | ---- range to drop ----- |
* | ------ extent ------ |
*/
- if (start <= key.offset && end >= extent_end) {
+ if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
@@ -990,8 +770,7 @@ delete_extent_item:
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(inode,
- extent_end - key.offset);
+ args->bytes_found += extent_end - key.offset;
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
@@ -1001,14 +780,14 @@ delete_extent_item:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
- key.offset - extent_offset);
+ key.offset - extent_offset, 0,
+ false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(inode,
- extent_end - key.offset);
+ args->bytes_found += extent_end - key.offset;
}
- if (end == extent_end)
+ if (args->end == extent_end)
break;
if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
@@ -1038,7 +817,7 @@ delete_extent_item:
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
- * path->slots[0] for our insertion (if replace_extent != 0).
+ * path->slots[0] for our insertion (if args->replace_extent).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
@@ -1052,15 +831,14 @@ delete_extent_item:
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
- if (!ret && replace_extent && leafs_visited == 1 &&
- (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
- path->locks[0] == BTRFS_WRITE_LOCK) &&
+ if (!ret && args->replace_extent &&
+ path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >=
- sizeof(struct btrfs_item) + extent_item_size) {
+ sizeof(struct btrfs_item) + args->extent_item_size) {
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
+ key.offset = args->start;
if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
struct btrfs_key slot_key;
@@ -1068,34 +846,17 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
- *key_inserted = 1;
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
+ args->extent_inserted = true;
}
- if (!replace_extent || !(*key_inserted))
+ if (!args->path)
+ btrfs_free_path(path);
+ else if (!args->extent_inserted)
btrfs_release_path(path);
- if (drop_end)
- *drop_end = found ? min(end, last_end) : end;
- return ret;
-}
-
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode, u64 start,
- u64 end, int drop_cache)
-{
- struct btrfs_path *path;
- int ret;
+out:
+ args->drop_end = found ? min(args->end, last_end) : args->end;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
- drop_cache, 0, 0, NULL);
- btrfs_free_path(path);
return ret;
}
@@ -1160,7 +921,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_nr = 0;
int del_slot = 0;
int recow;
- int ret;
+ int ret = 0;
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
@@ -1296,7 +1057,7 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset);
+ orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1321,7 +1082,8 @@ again:
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1381,7 +1143,7 @@ again:
}
out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -1392,11 +1154,12 @@ static int prepare_uptodate_page(struct inode *inode,
struct page *page, u64 pos,
bool force_uptodate)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
!PageUptodate(page)) {
- ret = btrfs_readpage(NULL, page);
+ ret = btrfs_read_folio(NULL, folio);
if (ret)
return ret;
lock_page(page);
@@ -1404,7 +1167,18 @@ static int prepare_uptodate_page(struct inode *inode,
unlock_page(page);
return -EIO;
}
- if (page->mapping != inode->i_mapping) {
+
+ /*
+ * Since btrfs_read_folio() will unlock the folio before it
+ * returns, there is a window where btrfs_release_folio() can be
+ * called to release the page. Here we check both inode
+ * mapping and PagePrivate() to make sure the page was not
+ * released.
+ *
+ * The private flag check is essential for subpage as we need
+ * to store extra bitmap using page->private.
+ */
+ if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
unlock_page(page);
return -EAGAIN;
}
@@ -1412,26 +1186,60 @@ static int prepare_uptodate_page(struct inode *inode,
return 0;
}
+static unsigned int get_prepare_fgp_flags(bool nowait)
+{
+ unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+ if (nowait)
+ fgp_flags |= FGP_NOWAIT;
+
+ return fgp_flags;
+}
+
+static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
+{
+ gfp_t gfp;
+
+ gfp = btrfs_alloc_write_mask(inode->i_mapping);
+ if (nowait) {
+ gfp &= ~__GFP_DIRECT_RECLAIM;
+ gfp |= GFP_NOWAIT;
+ }
+
+ return gfp;
+}
+
/*
* this just gets pages into the page cache and locks them down.
*/
static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate)
+ size_t write_bytes, bool force_uptodate,
+ bool nowait)
{
int i;
unsigned long index = pos >> PAGE_SHIFT;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ gfp_t mask = get_prepare_gfp_flags(inode, nowait);
+ unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
int err = 0;
int faili;
for (i = 0; i < num_pages; i++) {
again:
- pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask | __GFP_WRITE);
+ pages[i] = pagecache_get_page(inode->i_mapping, index + i,
+ fgp_flags, mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
- err = -ENOMEM;
+ if (nowait)
+ err = -EAGAIN;
+ else
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = set_page_extent_mapped(pages[i]);
+ if (err < 0) {
+ faili = i;
goto fail;
}
@@ -1443,7 +1251,7 @@ again:
pos + write_bytes, false);
if (err) {
put_page(pages[i]);
- if (err == -EAGAIN) {
+ if (!nowait && err == -EAGAIN) {
err = 0;
goto again;
}
@@ -1478,7 +1286,7 @@ static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
size_t write_bytes,
- u64 *lockstart, u64 *lockend,
+ u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1488,28 +1296,37 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+
+ return -EAGAIN;
+ }
+ } else {
+ lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent_cached(&inode->io_tree, start_pos,
- last_pos, cached_state);
+ unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1522,29 +1339,36 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * It's possible the pages are dirty right now, but we don't want
- * to clean them yet because copy_from_user may catch a page fault
- * and we might have to fall back to one page at a time. If that
- * happens, we'll unlock these pages and we'd have a window where
- * reclaim could sneak in and drop the once-dirty page on the floor
- * without writing it.
- *
- * We have the pages locked and the extent range locked, so there's
- * no way someone can start IO on any dirty pages in this range.
- *
- * We'll call btrfs_dirty_pages() later on, and that will flip around
- * delalloc bits and dirty the pages as required.
+ * We should be called after prepare_pages() which should have locked
+ * all pages in the range.
*/
- for (i = 0; i < num_pages; i++) {
- set_page_extent_mapped(pages[i]);
+ for (i = 0; i < num_pages; i++)
WARN_ON(!PageLocked(pages[i]));
- }
return ret;
}
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset.
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range.
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * > 0 If we can nocow, and updates @write_bytes.
+ * 0 If we can't do a nocow write.
+ * -EAGAIN If we can't do a nocow write because snapshoting of the inode's
+ * root is in progress.
+ * < 0 If an error happened.
+ *
+ * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1552,41 +1376,117 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- ret = btrfs_start_write_no_snapshotting(root);
- if (!ret)
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return 0;
+
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
-
- btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
- lockend, NULL);
-
num_bytes = lockend - lockstart + 1;
- ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
- if (ret <= 0) {
- ret = 0;
- btrfs_end_write_no_snapshotting(root);
+
+ if (nowait) {
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ return -EAGAIN;
+ }
} else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+ }
+ ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
+ NULL, NULL, NULL, nowait, false);
+ if (ret <= 0)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ else
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
- }
-
- unlock_extent(&inode->io_tree, lockstart, lockend);
+ unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
return ret;
}
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
+{
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
+}
+
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec64 now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_time(inode);
+ if (!timespec64_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec64_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
+static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
+ size_t count)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos = iocb->ki_pos;
+ int ret;
+ loff_t oldsize;
+ loff_t start_pos;
+
+ /*
+ * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
+ * prealloc flags, as without those flags we always have to COW. We will
+ * later check if we can really COW into the target range (using
+ * can_nocow_extent() at btrfs_get_blocks_direct_write()).
+ */
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return -EAGAIN;
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
+
+ start_pos = round_down(pos, fs_info->sectorsize);
+ oldsize = i_size_read(inode);
+ if (start_pos > oldsize) {
+ /* Expand hole size to cover write data, preventing empty gap */
+ loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
+
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
+ if (ret) {
+ current->backing_dev_info = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
+ loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1594,17 +1494,39 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
u64 lockend;
size_t num_written = 0;
int nrptrs;
- int ret = 0;
+ ssize_t ret;
bool only_release_metadata = false;
bool force_page_uptodate = false;
+ loff_t old_isize = i_size_read(inode);
+ unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
+ unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+
+ if (nowait)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ ret = btrfs_inode_lock(inode, ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, i);
+ if (ret <= 0)
+ goto out;
+ ret = btrfs_write_check(iocb, i, ret);
+ if (ret < 0)
+ goto out;
+
+ pos = iocb->ki_pos;
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL;
@@ -1613,8 +1535,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
offset);
- size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_SIZE);
+ size_t num_pages;
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1622,72 +1543,82 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
size_t num_sectors;
int extents_locked;
- WARN_ON(num_pages > nrptrs);
-
/*
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
- if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
ret = -EFAULT;
break;
}
only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
- reserve_bytes = round_up(write_bytes + sector_offset,
- fs_info->sectorsize);
extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
- write_bytes);
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &data_reserved, pos,
+ write_bytes, nowait);
if (ret < 0) {
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
- /*
- * For nodata cow case, no need to reserve
- * data space.
- */
- only_release_metadata = true;
- /*
- * our prealloc extent may be smaller than
- * write_bytes, so scale down.
- */
- num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_SIZE);
- reserve_bytes = round_up(write_bytes +
- sector_offset,
- fs_info->sectorsize);
- } else {
+ int can_nocow;
+
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
+ ret = -EAGAIN;
break;
}
+
+ /*
+ * If we don't have to COW at the offset, reserve
+ * metadata only. write_bytes may get smaller than
+ * requested here.
+ */
+ can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
+ break;
+ only_release_metadata = true;
}
+ num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
+ WARN_ON(num_pages > nrptrs);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes,
+ reserve_bytes, nowait);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
+
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
break;
}
release_bytes = reserve_bytes;
again:
+ ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
+ if (ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+ break;
+ }
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes,
- force_page_uptodate);
+ pos, write_bytes, force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
@@ -1697,10 +1628,11 @@ again:
extents_locked = lock_and_cleanup_extent_if_need(
BTRFS_I(inode), pages,
num_pages, pos, write_bytes, &lockstart,
- &lockend, &cached_state);
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
- if (extents_locked == -EAGAIN)
+ if (!nowait && extents_locked == -EAGAIN)
goto again;
+
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
ret = extents_locked;
@@ -1733,8 +1665,7 @@ again:
if (num_sectors > dirty_sectors) {
/* release everything except the sectors we dirtied */
- release_bytes -= dirty_sectors <<
- fs_info->sb->s_blocksize_bits;
+ release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
@@ -1744,7 +1675,7 @@ again:
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
@@ -1753,9 +1684,9 @@ again:
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
- if (copied > 0)
- ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
@@ -1765,40 +1696,25 @@ again:
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
else
free_extent_state(cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
break;
}
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_write_no_snapshotting(root);
-
- if (only_release_metadata && copied > 0) {
- lockstart = round_down(pos,
- fs_info->sectorsize);
- lockend = round_up(pos + copied,
- fs_info->sectorsize) - 1;
-
- set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_NORESERVE, NULL,
- NULL, GFP_NOFS);
- }
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
cond_resched();
- balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
- btrfs_btree_balance_dirty(fs_info);
-
pos += copied;
num_written += copied;
}
@@ -1807,34 +1723,177 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
}
extent_changeset_free(data_reserved);
+ if (num_written > 0) {
+ pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
+ iocb->ki_pos += num_written;
+ }
+out:
+ btrfs_inode_unlock(inode, ilock_flags);
return num_written ? num_written : ret;
}
-static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos;
- ssize_t written;
+ ssize_t written = 0;
ssize_t written_buffered;
+ size_t prev_left = 0;
loff_t endbyte;
- int err;
+ ssize_t err;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
- written = generic_file_direct_write(iocb, from);
+ /* If the write DIO is within EOF, use a shared lock */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
- if (written < 0 || !iov_iter_count(from))
- return written;
+relock:
+ err = btrfs_inode_lock(inode, ilock_flags);
+ if (err < 0)
+ return err;
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ return err;
+ }
+
+ err = btrfs_write_check(iocb, from, err);
+ if (err < 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+ * Re-check since file size may have changed just before taking the
+ * lock or pos may have changed because of O_APPEND in generic_write_check()
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto buffered;
+ }
+
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete as we didn't submit
+ * yet the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+ from->nofault = true;
+ dio = btrfs_dio_write(iocb, from, written);
+ from->nofault = false;
+
+ /*
+ * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+ * iocb, and that needs to lock the inode. So unlock it before calling
+ * iomap_dio_complete() to avoid a deadlock.
+ */
+ btrfs_inode_unlock(inode, ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio))
+ err = PTR_ERR_OR_ZERO(dio);
+ else
+ err = iomap_dio_complete(dio);
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (err > 0)
+ written = err;
+
+ if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many as
+ * possible of the remainder pages and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fallback
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ err = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto relock;
+ }
+ }
+
+ /*
+ * If 'err' is -ENOTBLK or we have not written all data, then it means
+ * we must fallback to buffered IO.
+ */
+ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+
+buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * since we currently don't have NOWAIT semantics support for buffered IO
+ * and may block there for many reasons (reserving space for example).
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ err = -EAGAIN;
+ goto out;
+ }
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
@@ -1858,150 +1917,91 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
out:
- return written ? written : err;
-}
-
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
-
- if (!timespec64_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
+ return err < 0 ? err : written;
}
-static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
- struct iov_iter *from)
+static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 start_pos;
- u64 end_pos;
- ssize_t num_written = 0;
- const bool sync = iocb->ki_flags & IOCB_DSYNC;
- ssize_t err;
- loff_t pos;
- size_t count;
- loff_t oldsize;
- int clean_page = 0;
+ loff_t count;
+ ssize_t ret;
- if (!(iocb->ki_flags & IOCB_DIRECT) &&
- (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
- } else {
- inode_lock(inode);
- }
-
- err = generic_write_checks(iocb, from);
- if (err <= 0) {
- inode_unlock(inode);
- return err;
- }
-
- pos = iocb->ki_pos;
- count = iov_iter_count(from);
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ btrfs_inode_lock(inode, 0);
+ count = encoded->len;
+ ret = generic_write_checks_count(iocb, &count);
+ if (ret == 0 && count != encoded->len) {
/*
- * We will allocate space in case nodatacow is not set,
- * so bail
+ * The write got truncated by generic_write_checks_count(). We
+ * can't do a partial encoded write.
*/
- if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
- inode_unlock(inode);
- return -EAGAIN;
- }
+ ret = -EFBIG;
}
-
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err) {
- inode_unlock(inode);
+ if (ret || encoded->len == 0)
goto out;
- }
- /*
- * If BTRFS flips readonly due to some impossible error
- * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
- * although we have opened a file as writable, we have
- * to stop this write operation to ensure FS consistency.
- */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- inode_unlock(inode);
- err = -EROFS;
+ ret = btrfs_write_check(iocb, from, encoded->len);
+ if (ret < 0)
goto out;
- }
+
+ ret = btrfs_do_encoded_write(iocb, from, encoded);
+out:
+ btrfs_inode_unlock(inode, 0);
+ return ret;
+}
+
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct file *file = iocb->ki_filp;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ ssize_t num_written, num_sync;
+ const bool sync = iocb_is_dsync(iocb);
/*
- * We reserve space for updating the inode when we reserve space for the
- * extent we are going to write, so we will enospc out there. We don't
- * need to start yet another transaction to update the inode as we will
- * update the inode when we finish writing whatever data we write.
+ * If the fs flips readonly due to some impossible error, although we
+ * have opened a file as writable, we have to stop this write operation
+ * to ensure consistency.
*/
- update_time_for_write(inode);
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
+ return -EROFS;
- start_pos = round_down(pos, fs_info->sectorsize);
- oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
- /* Expand hole size to cover write data, preventing empty gap */
- end_pos = round_up(pos + count,
- fs_info->sectorsize);
- err = btrfs_cont_expand(inode, oldsize, end_pos);
- if (err) {
- inode_unlock(inode);
- goto out;
- }
- if (start_pos > round_up(oldsize, fs_info->sectorsize))
- clean_page = 1;
- }
+ if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (sync)
- atomic_inc(&BTRFS_I(inode)->sync_writers);
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = __btrfs_direct_write(iocb, from);
+ atomic_inc(&inode->sync_writers);
+
+ if (encoded) {
+ num_written = btrfs_encoded_write(iocb, from, encoded);
+ num_sync = encoded->len;
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
+ num_written = btrfs_direct_write(iocb, from);
+ num_sync = num_written;
} else {
num_written = btrfs_buffered_write(iocb, from);
- if (num_written > 0)
- iocb->ki_pos = pos + num_written;
- if (clean_page)
- pagecache_isize_extended(inode, oldsize,
- i_size_read(inode));
+ num_sync = num_written;
}
- inode_unlock(inode);
+ btrfs_set_inode_last_sub_trans(inode);
- /*
- * We also have to set last_sub_trans to the current log transid,
- * otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occurred.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
- if (num_written > 0)
- num_written = generic_write_sync(iocb, num_written);
+ if (num_sync > 0) {
+ num_sync = generic_write_sync(iocb, num_sync);
+ if (num_sync < 0)
+ num_written = num_sync;
+ }
if (sync)
- atomic_dec(&BTRFS_I(inode)->sync_writers);
-out:
+ atomic_dec(&inode->sync_writers);
+
current->backing_dev_info = NULL;
- return num_written ? num_written : err;
+ return num_written;
+}
+
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ return btrfs_do_write_iter(iocb, from, NULL);
}
int btrfs_release_file(struct inode *inode, struct file *filp)
@@ -2014,12 +2014,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2045,6 +2045,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
return ret;
}
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ list_empty(&ctx->ordered_extents))
+ return true;
+
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
+ if (inode->last_trans <= fs_info->last_trans_committed &&
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
+ list_empty(&ctx->ordered_extents)))
+ return true;
+
+ return false;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -2065,12 +2089,26 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
+ */
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
+
+ /*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
* multi-task, and make the performance up. See
@@ -2080,36 +2118,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- inode_lock(inode);
-
- /*
- * We take the dio_sem here because the tree log stuff can race with
- * lockless dio writes and get an extent map logged for an extent we
- * never waited on. We need it this high up for lockdep reasons.
- */
- down_write(&BTRFS_I(inode)->dio_sem);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection. Do this while holding the inode lock, to avoid races
- * with other tasks.
- */
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
-
- /*
- * Before we acquired the inode's lock, someone may have dirtied more
- * pages in the target range. We need to make sure that writeback for
- * any such pages does not start while we are logging the inode, because
- * if it does, any of the following might happen when we are not doing a
- * full inode sync:
+ * Before we acquired the inode's lock and the mmap lock, someone may
+ * have dirtied more pages in the target range. We need to make sure
+ * that writeback for any such pages does not start while we are logging
+ * the inode, because if it does, any of the following might happen when
+ * we are not doing a full inode sync:
*
* 1) We log an extent after its writeback finishes but before its
* checksums are added to the csum tree, leading to -EIO errors
@@ -2124,28 +2142,56 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
/*
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
+ * We check the flag here after starting delalloc above, because when
+ * running delalloc the full sync flag may be set if we need to drop
+ * extra extent map ranges due to temporary memory allocation failures.
+ */
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
+ *
+ * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
+ * logical address recorded in the ordered extent may change. We need
+ * to wait for the IO to stabilize the logical address.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync || btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
smp_mb();
- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2161,9 +2207,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2180,15 +2224,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
+ trans->in_fsync = true;
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
/* we've logged all the items and now have a consistent
@@ -2201,27 +2245,76 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret == BTRFS_NO_LOG_SYNC) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ /* We successfully logged the inode, attempt to sync the log. */
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
- ret = btrfs_sync_log(trans, root, &ctx);
- if (!ret) {
- ret = btrfs_end_transaction(trans);
- goto out;
- }
+ ret = btrfs_end_transaction(trans);
+ goto out;
}
- ret = btrfs_commit_transaction(trans);
- } else {
+ }
+
+ /*
+ * At this point we need to commit the transaction because we had
+ * btrfs_need_log_full_commit() or some other error.
+ *
+ * If we didn't do a full sync we have to stop the trans handle, wait on
+ * the ordered extents, start it again and commit the transaction. If
+ * we attempt to wait on the ordered extents here we could deadlock with
+ * something like fallocate() that is holding the extent lock trying to
+ * start a transaction while some other thread is trying to commit the
+ * transaction while we (fsync) are currently holding the transaction
+ * open.
+ */
+ if (!full_sync) {
ret = btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret)
+ goto out;
+
+ /*
+ * This is safe to use here because we're only interested in
+ * making sure the transaction that had the ordered extents is
+ * committed. We aren't waiting on anything past this point,
+ * we're purely getting the transaction and committing it.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+
+ /*
+ * We committed the transaction and there's no currently
+ * running transaction, this means everything we care
+ * about made it to disk and we are done.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
}
+
+ ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
+ ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2234,7 +2327,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct address_space *mapping = filp->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
@@ -2281,7 +2374,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct extent_map *hole_em;
- struct extent_map_tree *em_tree = &inode->extent_tree;
struct btrfs_key key;
int ret;
@@ -2315,6 +2407,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
@@ -2331,13 +2424,14 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
+ end - offset);
if (ret)
return ret;
@@ -2346,8 +2440,8 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_drop_extent_map_range(inode, offset, end - 1, false);
+ btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
hole_em->len = end - offset;
@@ -2360,16 +2454,10 @@ out:
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
- do {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- } while (ret == -EEXIST);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
if (ret)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
}
return 0;
@@ -2381,13 +2469,13 @@ out:
* em->start + em->len > start)
* When a hole extent is found, return 1 and modify start/len.
*/
-static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ em = btrfs_get_extent(inode, NULL, 0,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
@@ -2404,100 +2492,129 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
return ret;
}
-static int btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
{
- while (1) {
- struct btrfs_ordered_extent *ordered;
- int ret;
+ /*
+ * For subpage case, if the range is not at page boundary, we could
+ * have pages at the leading/tailing part of the range.
+ * This could lead to dead loop since filemap_range_has_page()
+ * will always return true.
+ * So here we need to do extra page alignment for
+ * filemap_range_has_page().
+ */
+ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
- * We need to make sure we have no ordered extents in this range
- * and nobody raced in and read a page in this range, if we did
- * we need to try again.
+ * We can't have ordered extents in the range, nor dirty/writeback
+ * pages, because we have locked the inode's VFS lock in exclusive
+ * mode, we have locked the inode's i_mmap_lock in exclusive mode,
+ * we have flushed all delalloc in the range and we have waited
+ * for any ordered extents in the range to complete.
+ * We can race with anyone reading pages from this range, so after
+ * locking the range check if we have pages in the range, and if
+ * we do, unlock the range and retry.
*/
- if ((!ordered ||
- (ordered->file_offset + ordered->num_bytes <= lockstart ||
- ordered->file_offset > lockend)) &&
- !filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
+ if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
+ page_lockend))
break;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, cached_state);
- ret = btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (ret)
- return ret;
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
- return 0;
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len,
+ const u64 bytes_to_drop)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
+ if (extent_info->disk_offset == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
+ }
- key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
+ replace_len);
+ if (ret)
+ return ret;
+
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
+ }
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(inode),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(inode), ref_offset, 0, false);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
@@ -2505,26 +2622,27 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
- const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
- struct btrfs_trans_handle **trans_out)
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
- u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
u64 cur_offset;
- u64 drop_end;
u64 len = end - start;
int ret = 0;
@@ -2537,15 +2655,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
goto out;
}
rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2559,33 +2677,43 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
- BUG_ON(ret);
+ if (WARN_ON(ret))
+ goto out_trans;
trans->block_rsv = rsv;
cur_offset = start;
+ drop_args.path = path;
+ drop_args.end = end + 1;
+ drop_args.drop_cache = true;
while (cur_offset < end) {
- ret = __btrfs_drop_extents(trans, root, inode, path,
- cur_offset, end + 1, &drop_end,
- 1, 0, 0, NULL);
+ drop_args.start = cur_offset;
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ /* If we are punching a hole decrement the inode's byte count */
+ if (!extent_info)
+ btrfs_update_inode_bytes(inode, 0,
+ drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
- * When cloning we want to avoid transaction aborts when
- * nothing was done and we are attempting to clone parts
- * of inline extents, in such cases -EOPNOTSUPP is
- * returned by __btrfs_drop_extents() without having
- * changed anything in the file.
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
@@ -2596,23 +2724,62 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
+ /*
+ * We are past the i_size here, but since we didn't
+ * insert holes we need to clear the mapped area so we
+ * know to not set disk_i_size in this area until a new
+ * file extent is inserted here.
+ */
+ ret = btrfs_inode_clear_file_extent_range(inode,
+ cur_offset,
+ drop_args.drop_end - cur_offset);
+ if (ret) {
+ /*
+ * We couldn't clear our area, so we could
+ * presumably adjust up and corrupt the fs, so
+ * we need to abort.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info &&
+ drop_args.drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_args.drop_end -
+ extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len,
+ drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
- cur_offset = drop_end;
+ /*
+ * We are releasing our handle on the transaction, balance the
+ * dirty pages of the btree inode and flush delayed items, and
+ * then get a new transaction handle, which may now point to a
+ * new transaction in case someone else may have committed the
+ * transaction we used to replace/drop file extent items. So
+ * bump the inode's iversion and update mtime and ctime except
+ * if we are called from a dedupe context. This is because a
+ * power failure/crash may happen after the transaction is
+ * committed and before we finish replacing/dropping all the
+ * file extent items we need.
+ */
+ inode_inc_iversion(&inode->vfs_inode);
+
+ if (!extent_info || extent_info->update_times) {
+ inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+ }
ret = btrfs_update_inode(trans, root, inode);
if (ret)
@@ -2630,10 +2797,13 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
+ if (WARN_ON(ret))
+ break;
trans->block_rsv = rsv;
- if (!clone_info) {
+ cur_offset = drop_args.drop_end;
+ len = end - cur_offset;
+ if (!extent_info && len) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2647,14 +2817,11 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
/*
* If we were cloning, force the next fsync to be a full one since we
* we replaced (or just dropped in the case of cloning holes when
- * NO_HOLES is enabled) extents and extent maps.
- * This is for the sake of simplicity, and cloning into files larger
- * than 16Mb would force the full fsync any way (when
- * try_release_extent_mapping() is invoked during page cache truncation.
+ * NO_HOLES is enabled) file extent items and did not setup new extent
+ * maps for the replacement extents (or holes).
*/
- if (clone_info)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ if (extent_info && !extent_info->is_new_extent)
+ btrfs_set_inode_full_sync(inode);
if (ret)
goto out_trans;
@@ -2671,25 +2838,36 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* will not record the existence of the hole region
* [existing_hole_start, lockend].
*/
- if (drop_end <= end)
- drop_end = end + 1;
+ if (drop_args.drop_end <= end)
+ drop_args.drop_end = end + 1;
/*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
+ if (!extent_info && cur_offset < ino_size &&
+ cur_offset < drop_args.drop_end) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
+ /* See the comment in the loop above for the reasoning here. */
+ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
+ drop_args.drop_end - cur_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, extent_info->data_len,
+ drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2711,8 +2889,9 @@ out:
return ret;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
@@ -2729,13 +2908,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
- return ret;
+ goto out_only_mutex;
- inode_lock(inode);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
- ret = find_first_non_hole(inode, &offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
@@ -2744,9 +2924,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
- lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
+ lockstart = round_up(offset, fs_info->sectorsize);
+ lockend = round_down(offset + len, fs_info->sectorsize) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2760,7 +2943,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode, offset, len, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
} else {
ret = 0;
}
@@ -2770,9 +2954,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode, offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
}
@@ -2785,7 +2969,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* after truncate page, check hole again */
len = offset + len - lockstart;
offset = lockstart;
- ret = find_first_non_hole(inode, &offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
@@ -2799,14 +2983,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
tail_start = lockend + 1;
tail_len = offset + len - tail_start;
if (tail_len) {
- ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
if (unlikely(ret < 0))
goto out_only_mutex;
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode,
+ ret = btrfs_truncate_block(BTRFS_I(inode),
tail_start + tail_len,
0, 1);
if (ret)
@@ -2820,10 +3004,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out_only_mutex;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
path = btrfs_alloc_path();
if (!path) {
@@ -2831,22 +3012,23 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
- &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
+ lockend, NULL, &trans);
btrfs_free_path(path);
if (ret)
goto out;
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -2867,13 +3049,13 @@ out_only_mutex:
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
}
}
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@@ -2892,22 +3074,20 @@ struct falloc_range {
*/
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
- struct falloc_range *prev = NULL;
struct falloc_range *range = NULL;
- if (list_empty(head))
- goto insert;
-
- /*
- * As fallocate iterate by bytenr order, we only need to check
- * the last range.
- */
- prev = list_entry(head->prev, struct falloc_range, list);
- if (prev->start + prev->len == start) {
- prev->len += len;
- return 0;
+ if (!list_empty(head)) {
+ /*
+ * As fallocate iterates by bytenr order, we only need to check
+ * the last range.
+ */
+ range = list_last_entry(head, struct falloc_range, list);
+ if (range->start + range->len == start) {
+ range->len += len;
+ return 0;
+ }
}
-insert:
+
range = kmalloc(sizeof(*range), GFP_KERNEL);
if (!range)
return -ENOMEM;
@@ -2935,8 +3115,8 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_ordered_update_i_size(inode, end, NULL);
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
@@ -2948,15 +3128,15 @@ enum {
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
struct extent_map *em;
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2981,14 +3161,12 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = fs_info->sectorsize;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- inode_dio_wait(inode);
-
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
@@ -3048,7 +3226,8 @@ static int btrfs_zero_range(struct inode *inode,
}
if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
free_extent_map(em);
- ret = btrfs_truncate_block(inode, offset, len, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
@@ -3071,14 +3250,15 @@ static int btrfs_zero_range(struct inode *inode,
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(inode, offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret)
goto out;
} else {
@@ -3087,7 +3267,7 @@ static int btrfs_zero_range(struct inode *inode,
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3095,7 +3275,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
+ 0, 1);
if (ret)
goto out;
} else {
@@ -3115,20 +3296,21 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
- if (ret)
- goto out;
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
+ if (ret) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
+ }
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
offset + len, &alloc_hint);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -3138,7 +3320,7 @@ reserve_space:
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
@@ -3161,10 +3343,17 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
+ u64 data_space_needed = 0;
+ u64 data_space_reserved = 0;
+ u64 qgroup_reserved = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
+ /* Do not allow fallocate in ZONED mode */
+ if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
+ return -EOPNOTSUPP;
+
alloc_start = round_down(offset, blocksize);
alloc_end = round_up(offset + len, blocksize);
cur_offset = alloc_start;
@@ -3175,21 +3364,9 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
-
- /*
- * Only trigger disk allocation, don't trigger qgroup reserve
- *
- * For qgroup space, it will be checked later.
- */
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
- alloc_end - alloc_start);
- if (ret < 0)
- return ret;
- }
+ return btrfs_punch_hole(file, offset, len);
- inode_lock(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3197,6 +3374,10 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
@@ -3205,7 +3386,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* But that's a minor problem and won't do much harm BTW.
*/
if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, i_size_read(inode),
+ ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
alloc_start);
if (ret)
goto out;
@@ -3215,14 +3396,18 @@ static long btrfs_fallocate(struct file *file, int mode,
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
- ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
}
/*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
+ * We have locked the inode at the VFS level (in exclusive mode) and we
+ * have locked the i_mmap_lock lock (in exclusive mode). Now before
+ * locking the file range, flush all dealloc in the range and wait for
+ * all ordered extents in the range to complete. After this we can lock
+ * the file range and, due to the previous locking we did, we know there
+ * can't be more delalloc or ordered extents in the range.
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
@@ -3231,42 +3416,15 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
-
- if (ordered &&
- ordered->file_offset + ordered->num_bytes > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- ret = btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- if (ret)
- goto out;
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
@@ -3283,46 +3441,64 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = add_falloc_range(&reserve_list, cur_offset,
- last_byte - cur_offset);
+ const u64 range_len = last_byte - cur_offset;
+
+ ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- cur_offset, last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset, range_len);
if (ret < 0) {
- cur_offset = last_byte;
free_extent_map(em);
break;
}
- } else {
- /*
- * Do not need to reserve unwritten extent for this
- * range, free reserved data space first, otherwise
- * it'll result in false ENOSPC error.
- */
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, last_byte - cur_offset);
+ qgroup_reserved += range_len;
+ data_space_needed += range_len;
}
free_extent_map(em);
cur_offset = last_byte;
}
+ if (!ret && data_space_needed > 0) {
+ /*
+ * We are safe to reserve space here as we can't have delalloc
+ * in the range, see above.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ data_space_needed);
+ if (!ret)
+ data_space_reserved = data_space_needed;
+ }
+
/*
* If ret is still 0, means we're OK to fallocate.
* Or just cleanup the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) {
- if (!ret)
+ if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
- else
- btrfs_free_reserved_data_space(inode,
- data_reserved, range->start,
- range->len);
+ /*
+ * btrfs_prealloc_file_range() releases space even
+ * if it returns an error.
+ */
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (data_space_reserved > 0) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, range->start,
+ range->len);
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (qgroup_reserved > 0) {
+ btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
+ range->start, range->len);
+ qgroup_reserved -= range->len;
+ }
list_del(&range->list);
kfree(range);
}
@@ -3335,35 +3511,290 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
- inode_unlock(inode);
- /* Let go of our reservation. */
- if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, alloc_end - cur_offset);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
return ret;
}
-static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+/*
+ * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
+ * that has unflushed and/or flushing delalloc. There might be other adjacent
+ * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
+ * looping while it gets adjacent subranges, and merging them together.
+ */
+static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ const u64 len = end + 1 - start;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ u64 em_end;
+ u64 delalloc_len;
+
+ /*
+ * Search the io tree first for EXTENT_DELALLOC. If we find any, it
+ * means we have delalloc (dirty pages) for which writeback has not
+ * started yet.
+ */
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1);
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
+ if (delalloc_len > 0)
+ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+
+ /*
+ * Now also check if there's any extent map in the range that does not
+ * map to a hole or prealloc extent. We do this because:
+ *
+ * 1) When delalloc is flushed, the file range is locked, we clear the
+ * EXTENT_DELALLOC bit from the io tree and create an extent map for
+ * an allocated extent. So we might just have been called after
+ * delalloc is flushed and before the ordered extent completes and
+ * inserts the new file extent item in the subvolume's btree;
+ *
+ * 2) We may have an extent map created by flushing delalloc for a
+ * subrange that starts before the subrange we found marked with
+ * EXTENT_DELALLOC in the io tree.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ read_unlock(&em_tree->lock);
+
+ /* extent_map_end() returns a non-inclusive end offset. */
+ em_end = em ? extent_map_end(em) : 0;
+
+ /*
+ * If we have a hole/prealloc extent map, check the next one if this one
+ * ends before our range's end.
+ */
+ if (em && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
+ struct extent_map *next_em;
+
+ read_lock(&em_tree->lock);
+ next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
+ read_unlock(&em_tree->lock);
+
+ free_extent_map(em);
+ em_end = next_em ? extent_map_end(next_em) : 0;
+ em = next_em;
+ }
+
+ if (em && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ /*
+ * No extent map or one for a hole or prealloc extent. Use the delalloc
+ * range we found in the io tree if we have one.
+ */
+ if (!em)
+ return (delalloc_len > 0);
+
+ /*
+ * We don't have any range as EXTENT_DELALLOC in the io tree, so the
+ * extent map is the only subrange representing delalloc.
+ */
+ if (delalloc_len == 0) {
+ *delalloc_start_ret = em->start;
+ *delalloc_end_ret = min(end, em_end - 1);
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map represents a delalloc range that starts before the
+ * delalloc range we found in the io tree.
+ */
+ if (em->start < *delalloc_start_ret) {
+ *delalloc_start_ret = em->start;
+ /*
+ * If the ranges are adjacent, return a combined range.
+ * Otherwise return the extent map's range.
+ */
+ if (em_end < *delalloc_start_ret)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map starts after the delalloc range we found in the io
+ * tree. If it's adjacent, return a combined range, otherwise return
+ * the range found in the io tree.
+ */
+ if (*delalloc_end_ret + 1 == em->start)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode: The inode.
+ * @start: The start offset of the range. It does not need to be
+ * sector size aligned.
+ * @end: The end offset (inclusive value) of the search range.
+ * It does not need to be sector size aligned.
+ * @delalloc_start_ret: Output argument, set to the start offset of the
+ * subrange found with delalloc (may not be sector size
+ * aligned).
+ * @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
+ * of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+ u64 prev_delalloc_end = 0;
+ bool ret = false;
+
+ while (cur_offset < end) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ if (prev_delalloc_end == 0) {
+ /* First subrange found. */
+ *delalloc_start_ret = max(delalloc_start, start);
+ *delalloc_end_ret = delalloc_end;
+ ret = true;
+ } else if (delalloc_start == prev_delalloc_end + 1) {
+ /* Subrange adjacent to the previous one, merge them. */
+ *delalloc_end_ret = delalloc_end;
+ } else {
+ /* Subrange not adjacent to the previous one, exit. */
+ break;
+ }
+
+ prev_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Check if there's a hole or delalloc range in a range representing a hole (or
+ * prealloc extent) found in the inode's subvolume btree.
+ *
+ * @inode: The inode.
+ * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start: Start offset of the hole region. It does not need to be sector
+ * size aligned.
+ * @end: End offset (inclusive value) of the hole region. It does not
+ * need to be sector size aligned.
+ * @start_ret: Return parameter, used to set the start of the subrange in the
+ * hole that matches the search criteria (seek mode), if such
+ * subrange is found (return value of the function is true).
+ * The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ u64 start, u64 end, u64 *start_ret)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+ &delalloc_start, &delalloc_end);
+ if (delalloc && whence == SEEK_DATA) {
+ *start_ret = delalloc_start;
+ return true;
+ }
+
+ if (delalloc && whence == SEEK_HOLE) {
+ /*
+ * We found delalloc but it starts after out start offset. So we
+ * have a hole between our start offset and the delalloc start.
+ */
+ if (start < delalloc_start) {
+ *start_ret = start;
+ return true;
+ }
+ /*
+ * Delalloc range starts at our start offset.
+ * If the delalloc range's length is smaller than our range,
+ * then it means we have a hole that starts where the delalloc
+ * subrange ends.
+ */
+ if (delalloc_end < end) {
+ *start_ret = delalloc_end + 1;
+ return true;
+ }
+
+ /* There's delalloc for the whole range. */
+ return false;
+ }
+
+ if (!delalloc && whence == SEEK_HOLE) {
+ *start_ret = start;
+ return true;
+ }
+
+ /*
+ * No delalloc in the range and we are seeking for data. The caller has
+ * to iterate to the next extent item in the subvolume btree.
+ */
+ return false;
+}
+
+static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *em = NULL;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *cached_state = NULL;
- loff_t i_size = inode->i_size;
+ const loff_t i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 last_extent_end;
u64 lockstart;
u64 lockend;
u64 start;
- u64 len;
- int ret = 0;
+ int ret;
+ bool found = false;
if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
+ * Quick path. If the inode has no prealloc extents and its number of
+ * bytes used matches its i_size, then it can not have holes.
+ */
+ if (whence == SEEK_HOLE &&
+ !(inode->flags & BTRFS_INODE_PREALLOC) &&
+ inode_get_bytes(&inode->vfs_inode) == i_size)
+ return i_size;
+
+ /*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
@@ -3374,46 +3805,164 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
- len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ last_extent_end = lockstart;
+
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
while (start < i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- em = NULL;
- break;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
}
- if (whence == SEEK_HOLE &&
- (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
- break;
- else if (whence == SEEK_DATA &&
- (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
- start = em->start + em->len;
- free_extent_map(em);
- em = NULL;
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * In the first iteration we may have a slot that points to an
+ * extent that ends before our start offset, so skip it.
+ */
+ if (extent_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ /* We have an implicit hole, NO_HOLES feature is likely set. */
+ if (last_extent_end < key.offset) {
+ u64 search_start = last_extent_end;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ key.offset - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the extent.
+ */
+ }
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+ btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
+ */
+ u64 search_start = key.offset;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ extent_end - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so need to analyze the next
+ * extent item.
+ */
+ } else {
+ /*
+ * Found a regular or inline extent.
+ * If we are seeking for data, adjust the start offset
+ * and stop, we're done.
+ */
+ if (whence == SEEK_DATA) {
+ start = max_t(u64, key.offset, offset);
+ found = true;
+ break;
+ }
+ /*
+ * Else, we are seeking for a hole, check the next file
+ * extent item.
+ */
+ }
+
+ start = extent_end;
+ last_extent_end = extent_end;
+ path->slots[0]++;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
}
- free_extent_map(em);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- if (ret) {
- offset = ret;
- } else {
- if (whence == SEEK_DATA && start >= i_size)
- offset = -ENXIO;
- else
- offset = min_t(loff_t, start, i_size);
+
+ /* We have an implicit hole from the last extent found up to i_size. */
+ if (!found && start < i_size) {
+ found = find_desired_extent_in_hole(inode, whence, start,
+ i_size - 1, &start);
+ if (!found)
+ start = i_size;
}
- return offset;
+out:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_free_path(path);
+
+ if (ret < 0)
+ return ret;
+
+ if (whence == SEEK_DATA && start >= i_size)
+ return -ENXIO;
+
+ return min_t(loff_t, start, i_size);
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -3425,9 +3974,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- inode_lock_shared(inode);
- offset = find_desired_extent(inode, offset, whence);
- inode_unlock_shared(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ offset = find_desired_extent(BTRFS_I(inode), offset, whence);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}
@@ -3439,18 +3988,126 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_NOWAIT;
+ int ret;
+
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
return generic_file_open(inode, filp);
}
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
+ for (seg = 0; seg < iter->nr_segs; seg++)
+ for (i = seg + 1; i < iter->nr_segs; i++)
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+ return 0;
+
+ if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault ins despite having set ->nofault
+ * to true of our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during he page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = btrfs_dio_read(iocb, to, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt,
+ * fallback to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ return ret < 0 ? ret : read;
+}
+
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = btrfs_direct_read(iocb, to);
+ if (ret < 0 || !iov_iter_count(to) ||
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
+ return ret;
+ }
+
+ return filemap_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
+ .splice_write = iter_file_splice_write,
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0598fd3c6e3f..f4023651dd68 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,17 +11,19 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
-#include "inode-map.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "subpage.h"
+#include "inode-item.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -33,16 +35,36 @@ struct btrfs_trim_range {
struct list_head list;
};
-static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *bitmap_info);
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info);
-static int btrfs_wait_cache_io_root(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path);
+ struct btrfs_free_space *info, bool update_stat);
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes, bool for_alloc);
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info);
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes, bool update_stats);
+
+static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+}
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -82,7 +104,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, &location, root, path);
+ inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -122,10 +144,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
block_group->disk_cache_state = BTRFS_DC_CLEAR;
}
- if (!block_group->iref) {
+ if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
block_group->inode = igrab(inode);
- block_group->iref = 1;
- }
spin_unlock(&block_group->lock);
return inode;
@@ -141,17 +161,15 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
+ /* We inline CRCs for the free disk space cache */
+ const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
+ BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
int ret;
ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
- /* We inline crc's for the free disk space cache */
- if (ino != BTRFS_FREE_INO_OBJECTID)
- flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
-
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -199,7 +217,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
int ret;
u64 ino;
- ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
+ ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
if (ret < 0)
return ret;
@@ -207,6 +225,64 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
ino, block_group->start);
}
+/*
+ * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
+ * handles lookup, otherwise it takes ownership and iputs the inode.
+ * Don't reuse an inode pointer after passing it into this function.
+ */
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!inode)
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) != -ENOENT)
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for the lookup ref */
+ btrfs_add_delayed_iput(inode);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.type = 0;
+ key.offset = block_group->start;
+ ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
+ -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
@@ -228,9 +304,18 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct inode *inode)
+ struct inode *vfs_inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(vfs_inode),
+ .new_size = 0,
+ .ino = btrfs_ino(BTRFS_I(vfs_inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_state *cached_state = NULL;
int ret = 0;
bool locked = false;
@@ -260,15 +345,22 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
}
- btrfs_i_size_write(BTRFS_I(inode), 0);
- truncate_pagecache(inode, 0);
+ btrfs_i_size_write(inode, 0);
+ truncate_pagecache(vfs_inode, 0);
+
+ lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
- ret = btrfs_truncate_inode_items(trans, root, inode,
- 0, BTRFS_EXTENT_DATA_KEY);
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
+
+ unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -285,35 +377,24 @@ fail:
static void readahead_cache(struct inode *inode)
{
- struct file_ra_state *ra;
+ struct file_ra_state ra;
unsigned long last_index;
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return;
-
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
-
- kfree(ra);
+ page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int write)
{
int num_pages;
- int check_crcs = 0;
num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
- check_crcs = 1;
-
/* Make sure we can fit our crcs and generation into the first page */
- if (write && check_crcs &&
- (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
+ if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
return -ENOSPC;
memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
@@ -324,7 +405,6 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
io_ctl->num_pages = num_pages;
io_ctl->fs_info = btrfs_sb(inode->i_sb);
- io_ctl->check_crcs = check_crcs;
io_ctl->inode = inode;
return 0;
@@ -364,29 +444,43 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- ClearPageChecked(io_ctl->pages[i]);
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
}
}
-static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
- int uptodate)
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
struct page *page;
+ struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
for (i = 0; i < io_ctl->num_pages; i++) {
+ int ret;
+
page = find_or_create_page(inode->i_mapping, i, mask);
if (!page) {
io_ctl_drop_pages(io_ctl);
return -ENOMEM;
}
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ io_ctl_drop_pages(io_ctl);
+ return ret;
+ }
+
io_ctl->pages[i] = page;
if (uptodate && !PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
@@ -403,59 +497,43 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
}
}
- for (i = 0; i < io_ctl->num_pages; i++) {
+ for (i = 0; i < io_ctl->num_pages; i++)
clear_page_dirty_for_io(io_ctl->pages[i]);
- set_page_extent_mapped(io_ctl->pages[i]);
- }
return 0;
}
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
* Skip the csum areas. If we don't check crcs then we just have a
* 64bit chunk at the front of the first page.
*/
- if (io_ctl->check_crcs) {
- io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
- io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
- } else {
- io_ctl->cur += sizeof(u64);
- io_ctl->size -= sizeof(u64) * 2;
- }
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
* chunk at the front of the first page.
*/
- if (io_ctl->check_crcs) {
- io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
- io_ctl->size -= sizeof(u64) +
- (sizeof(u32) * io_ctl->num_pages);
- } else {
- io_ctl->cur += sizeof(u64);
- io_ctl->size -= sizeof(u64) * 2;
- }
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -469,11 +547,6 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
u32 crc = ~(u32)0;
unsigned offset = 0;
- if (!io_ctl->check_crcs) {
- io_ctl_unmap_page(io_ctl);
- return;
- }
-
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
@@ -491,11 +564,6 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
u32 crc = ~(u32)0;
unsigned offset = 0;
- if (!io_ctl->check_crcs) {
- io_ctl_map_page(io_ctl, 0);
- return 0;
- }
-
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
@@ -525,8 +593,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -599,8 +667,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -628,42 +696,48 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
return 0;
}
-/*
- * Since we attach pinned extents after the fact we can have contiguous sections
- * of free space that are split up in entries. This poses a problem with the
- * tree logging stuff since it could have allocated across what appears to be 2
- * entries since we would have merged the entries when adding the pinned extents
- * back to the free space cache. So run through the space cache that we just
- * loaded and merge contiguous entries. This will make the log replay stuff not
- * blow up and it will make for nicer allocator behavior.
- */
-static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_free_space *e, *prev = NULL;
- struct rb_node *n;
+ struct btrfs_block_group *block_group = ctl->block_group;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+ u64 size = block_group->length;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
-again:
- spin_lock(&ctl->tree_lock);
- for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
- e = rb_entry(n, struct btrfs_free_space, offset_index);
- if (!prev)
- goto next;
- if (e->bitmap || prev->bitmap)
- goto next;
- if (prev->offset + prev->bytes == e->offset) {
- unlink_free_space(ctl, prev);
- unlink_free_space(ctl, e);
- prev->bytes += e->bytes;
- kmem_cache_free(btrfs_free_space_cachep, e);
- link_free_space(ctl, prev);
- prev = NULL;
- spin_unlock(&ctl->tree_lock);
- goto again;
- }
-next:
- prev = e;
- }
- spin_unlock(&ctl->tree_lock);
+ max_bitmaps = max_t(u64, max_bitmaps, 1);
+
+ if (ctl->total_bitmaps > max_bitmaps)
+ btrfs_err(block_group->fs_info,
+"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
+ block_group->start, block_group->length,
+ ctl->total_bitmaps, ctl->unit, max_bitmaps,
+ bytes_per_bg);
+ ASSERT(ctl->total_bitmaps <= max_bitmaps);
+
+ /*
+ * We are trying to keep the total amount of memory used per 1GiB of
+ * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
+ * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
+ * bitmaps, we may end up using more memory than this.
+ */
+ if (size < SZ_1G)
+ max_bytes = MAX_CACHE_BYTES_PER_GIG;
+ else
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
+
+ bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the max
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+
+ ctl->extents_thresh =
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
@@ -732,7 +806,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
readahead_cache(inode);
- ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ ret = io_ctl_prepare_pages(&io_ctl, true);
if (ret)
goto out;
@@ -747,8 +821,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
while (num_entries) {
e = kmem_cache_zalloc(btrfs_free_space_cachep,
GFP_NOFS);
- if (!e)
+ if (!e) {
+ ret = -ENOMEM;
goto free_cache;
+ }
ret = io_ctl_read_entry(&io_ctl, e, &type);
if (ret) {
@@ -756,17 +832,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
goto free_cache;
}
- /*
- * Sync discard ensures that the free space cache is always
- * trimmed. So when reading this in, the state should reflect
- * that. We also do this for async as a stop gap for lack of
- * persistence.
- */
- if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC))
- e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
-
if (!e->bytes) {
+ ret = -1;
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -787,6 +854,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
e->bitmap = kmem_cache_zalloc(
btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!e->bitmap) {
+ ret = -ENOMEM;
kmem_cache_free(
btrfs_free_space_cachep, e);
goto free_cache;
@@ -794,7 +862,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
btrfs_err(fs_info,
@@ -819,31 +887,64 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = io_ctl_read_bitmap(&io_ctl, e);
if (ret)
goto free_cache;
- e->bitmap_extents = count_bitmap_extents(ctl, e);
- if (!btrfs_free_space_trimmed(e)) {
- ctl->discardable_extents[BTRFS_STAT_CURR] +=
- e->bitmap_extents;
- ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
- }
}
io_ctl_drop_pages(&io_ctl);
- merge_space_tree(ctl);
ret = 1;
out:
- btrfs_discard_update_discardable(ctl->private, ctl);
io_ctl_free(&io_ctl);
return ret;
free_cache:
io_ctl_drop_pages(&io_ctl);
+
+ spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache(ctl);
+ spin_unlock(&ctl->tree_lock);
goto out;
}
+static int copy_free_space_cache(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int ret = 0;
+
+ while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ ret = btrfs_add_free_space(block_group, info->offset,
+ info->bytes);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ u64 offset = info->offset;
+ u64 bytes = ctl->unit;
+
+ while (search_bitmap(ctl, info, &offset, &bytes,
+ false) == 0) {
+ ret = btrfs_add_free_space(block_group, offset,
+ bytes);
+ if (ret)
+ break;
+ bitmap_clear_bits(ctl, info, offset, bytes, true);
+ offset = info->offset;
+ bytes = ctl->unit;
+ }
+ free_bitmap(ctl, info);
+ }
+ cond_resched();
+ }
+ return ret;
+}
+
+static struct lock_class_key btrfs_free_space_inode_key;
+
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space_ctl tmp_ctl = {};
struct inode *inode;
struct btrfs_path *path;
int ret = 0;
@@ -851,6 +952,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
u64 used = block_group->used;
/*
+ * Because we could potentially discard our loaded free space, we want
+ * to load everything into a temporary structure first, and then if it's
+ * valid copy it all into the actual free space ctl.
+ */
+ btrfs_init_free_space_ctl(block_group, &tmp_ctl);
+
+ /*
* If this block group has been marked to be cleared for one reason or
* another then we can't trust the on disk cache, so just return.
*/
@@ -901,19 +1009,39 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
}
spin_unlock(&block_group->lock);
- ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+ /*
+ * Reinitialize the class of struct inode's mapping->invalidate_lock for
+ * free space inodes to prevent false positives related to locks for normal
+ * inodes.
+ */
+ lockdep_set_class(&(&inode->i_data)->invalidate_lock,
+ &btrfs_free_space_inode_key);
+
+ ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
- spin_lock(&ctl->tree_lock);
- matched = (ctl->free_space == (block_group->length - used -
- block_group->bytes_super));
- spin_unlock(&ctl->tree_lock);
+ matched = (tmp_ctl.free_space == (block_group->length - used -
+ block_group->bytes_super));
- if (!matched) {
- __btrfs_remove_free_space_cache(ctl);
+ if (matched) {
+ ret = copy_free_space_cache(block_group, &tmp_ctl);
+ /*
+ * ret == 1 means we successfully loaded the free space cache,
+ * so we need to re-set it here.
+ */
+ if (ret == 0)
+ ret = 1;
+ } else {
+ /*
+ * We need to call the _locked variant so we don't try to update
+ * the discard counters.
+ */
+ spin_lock(&tmp_ctl.tree_lock);
+ __btrfs_remove_free_space_cache(&tmp_ctl);
+ spin_unlock(&tmp_ctl.tree_lock);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
@@ -932,6 +1060,9 @@ out:
block_group->start);
}
+ spin_lock(&ctl->tree_lock);
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
iput(inode);
return ret;
}
@@ -1032,7 +1163,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1044,8 +1175,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC, 0,
- 0, NULL);
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1067,6 +1198,7 @@ fail:
}
static noinline_for_stack int write_pinned_extent_entries(
+ struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
@@ -1085,7 +1217,7 @@ static noinline_for_stack int write_pinned_extent_entries(
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
- unpin = block_group->fs_info->pinned_extents;
+ unpin = &trans->transaction->pinned_extents;
start = block_group->start;
@@ -1140,7 +1272,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1160,8 +1292,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1185,19 +1317,15 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
- if (block_group) {
-#ifdef DEBUG
- btrfs_err(root->fs_info,
- "failed to write free space cache for block group %llu",
- block_group->start);
-#endif
- }
+ if (block_group)
+ btrfs_debug(root->fs_info,
+ "failed to write free space cache for block group %llu error %d",
+ block_group->start, ret);
}
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1226,14 +1354,6 @@ out:
}
-static int btrfs_wait_cache_io_root(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path)
-{
- return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
-}
-
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
@@ -1244,11 +1364,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
}
/**
- * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
+ * Write out cached info to an inode
+ *
+ * @root: root the inode belongs to
+ * @inode: freespace inode we are writing out
+ * @ctl: free space cache we are going to write out
+ * @block_group: block_group for this cache if it belongs to a block_group
+ * @io_ctl: holds context for the io
+ * @trans: the trans handle
*
* This function writes out a free space cache struct to disk for quick recovery
* on mount. This will return 0 if it was successful in writing the cache out,
@@ -1291,12 +1414,12 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}
/* Lock all pages first so we can lock the extent safely. */
- ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+ ret = io_ctl_prepare_pages(io_ctl, false);
if (ret)
goto out_unlock;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1317,7 +1440,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
+ ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
if (ret)
goto out_nospc_locked;
@@ -1336,8 +1459,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
- i_size_read(inode), &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
+ io_ctl->num_pages, 0, i_size_read(inode),
+ &cached_state, false);
if (ret)
goto out_nospc;
@@ -1348,13 +1472,14 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
@@ -1366,18 +1491,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
return 0;
-out:
- io_ctl->inode = NULL;
- io_ctl_free(io_ctl);
- if (ret) {
- invalidate_inode_pages2(inode->i_mapping);
- BTRFS_I(inode)->generation = 0;
- }
- btrfs_update_inode(trans, root, inode);
- if (must_iput)
- iput(inode);
- return ret;
-
out_nospc_locked:
cleanup_bitmap_list(&bitmap_list);
spin_unlock(&ctl->tree_lock);
@@ -1390,7 +1503,17 @@ out_unlock:
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
- goto out;
+out:
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ }
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (must_iput)
+ iput(inode);
+ return ret;
}
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
@@ -1416,11 +1539,9 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
block_group, &block_group->io_ctl, trans);
if (ret) {
-#ifdef DEBUG
- btrfs_err(fs_info,
- "failed to write free space cache for block group %llu",
- block_group->start);
-#endif
+ btrfs_debug(fs_info,
+ "failed to write free space cache for block group %llu error %d",
+ block_group->start, ret);
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
@@ -1517,6 +1638,50 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
}
/*
+ * This is a little subtle. We *only* have ->max_extent_size set if we actually
+ * searched through the bitmap and figured out the largest ->max_extent_size,
+ * otherwise it's 0. In the case that it's 0 we don't want to tell the
+ * allocator the wrong thing, we want to use the actual real max_extent_size
+ * we've found already if it's larger, or we want to use ->bytes.
+ *
+ * This matters because find_free_space() will skip entries who's ->bytes is
+ * less than the required bytes. So if we didn't search down this bitmap, we
+ * may pick some previous entry that has a smaller ->max_extent_size than we
+ * have. For example, assume we have two entries, one that has
+ * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
+ * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
+ * call into find_free_space(), and return with max_extent_size == 4K, because
+ * that first bitmap entry had ->max_extent_size set, but the second one did
+ * not. If instead we returned 8K we'd come in searching for 8K, and find the
+ * 8K contiguous range.
+ *
+ * Consider the other case, we have 2 8K chunks in that second entry and still
+ * don't have ->max_extent_size set. We'll return 16K, and the next time the
+ * allocator comes in it'll fully search our second bitmap, and this time it'll
+ * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
+ * right allocation the next loop through.
+ */
+static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
+{
+ if (entry->bitmap && entry->max_extent_size)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
+/*
+ * We want the largest entry to be leftmost, so this is inverted from what you'd
+ * normally expect.
+ */
+static bool entry_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct btrfs_free_space *entry, *exist;
+
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
+ return get_max_extent_size(exist) < get_max_extent_size(entry);
+}
+
+/*
* searches the tree for the given offset.
*
* fuzzy - If this is set, then we are trying to make an allocation, and we just
@@ -1528,15 +1693,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
u64 offset, int bitmap_only, int fuzzy)
{
struct rb_node *n = ctl->free_space_offset.rb_node;
- struct btrfs_free_space *entry, *prev = NULL;
+ struct btrfs_free_space *entry = NULL, *prev = NULL;
/* find entry that is closest to the 'offset' */
- while (1) {
- if (!n) {
- entry = NULL;
- break;
- }
-
+ while (n) {
entry = rb_entry(n, struct btrfs_free_space, offset_index);
prev = entry;
@@ -1546,6 +1706,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
n = n->rb_right;
else
break;
+
+ entry = NULL;
}
if (bitmap_only) {
@@ -1622,6 +1784,10 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
return NULL;
while (1) {
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
if (entry->offset + BITS_PER_BITMAP *
ctl->unit > offset)
@@ -1630,33 +1796,25 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
if (entry->offset + entry->bytes > offset)
break;
}
-
- n = rb_next(&entry->offset_index);
- if (!n)
- return NULL;
- entry = rb_entry(n, struct btrfs_free_space, offset_index);
}
return entry;
}
-static inline void
-__unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
+static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ bool update_stat)
{
rb_erase(&info->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
ctl->free_extents--;
if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
}
-}
-static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- __unlink_free_space(ctl, info);
- ctl->free_space -= info->bytes;
+ if (update_stat)
+ ctl->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_free_space_ctl *ctl,
@@ -1670,6 +1828,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
if (ret)
return ret;
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
+
if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR]++;
ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
@@ -1680,47 +1840,25 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
return ret;
}
-static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
{
- struct btrfs_block_group *block_group = ctl->private;
- u64 max_bytes;
- u64 bitmap_bytes;
- u64 extent_bytes;
- u64 size = block_group->length;
- u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
-
- max_bitmaps = max_t(u64, max_bitmaps, 1);
-
- ASSERT(ctl->total_bitmaps <= max_bitmaps);
-
- /*
- * We are trying to keep the total amount of memory used per 1GiB of
- * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
- * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
- * bitmaps, we may end up using more memory than this.
- */
- if (size < SZ_1G)
- max_bytes = MAX_CACHE_BYTES_PER_GIG;
- else
- max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
-
- bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+ ASSERT(info->bitmap);
/*
- * we want the extent entry threshold to always be at most 1/2 the max
- * bytes we can have, or whatever is less than that.
+ * If our entry is empty it's because we're on a cluster and we don't
+ * want to re-link it into our ctl bytes index.
*/
- extent_bytes = max_bytes - bitmap_bytes;
- extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+ if (RB_EMPTY_NODE(&info->bytes_index))
+ return;
- ctl->extents_thresh =
- div_u64(extent_bytes, sizeof(struct btrfs_free_space));
+ rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
+ rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
}
-static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info,
- u64 offset, u64 bytes)
+static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info,
+ u64 offset, u64 bytes, bool update_stat)
{
unsigned long start, count, end;
int extent_delta = -1;
@@ -1736,6 +1874,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
if (info->max_extent_size > ctl->unit)
info->max_extent_size = 0;
+ relink_bitmap_entry(ctl, info);
+
if (start && test_bit(start - 1, info->bitmap))
extent_delta++;
@@ -1747,14 +1887,9 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
}
-}
-static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info, u64 offset,
- u64 bytes)
-{
- __bitmap_clear_bits(ctl, info, offset, bytes);
- ctl->free_space -= bytes;
+ if (update_stat)
+ ctl->free_space -= bytes;
}
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
@@ -1771,9 +1906,16 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
bitmap_set(info->bitmap, start, count);
+ /*
+ * We set some bytes, we have no idea what the max extent size is
+ * anymore.
+ */
+ info->max_extent_size = 0;
info->bytes += bytes;
ctl->free_space += bytes;
+ relink_bitmap_entry(ctl, info);
+
if (start && test_bit(start - 1, info->bitmap))
extent_delta--;
@@ -1841,20 +1983,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
*bytes = (u64)(max_bits) * ctl->unit;
bitmap_info->max_extent_size = *bytes;
+ relink_bitmap_entry(ctl, bitmap_info);
return -1;
}
-static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
-{
- if (entry->bitmap)
- return entry->max_extent_size;
- return entry->bytes;
-}
-
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
- unsigned long align, u64 *max_extent_size)
+ unsigned long align, u64 *max_extent_size, bool use_bytes_index)
{
struct btrfs_free_space *entry;
struct rb_node *node;
@@ -1864,16 +2000,38 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (!ctl->free_space_offset.rb_node)
goto out;
+again:
+ if (use_bytes_index) {
+ node = rb_first_cached(&ctl->free_space_bytes);
+ } else {
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
+ 0, 1);
+ if (!entry)
+ goto out;
+ node = &entry->offset_index;
+ }
- entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
- if (!entry)
- goto out;
+ for (; node; node = rb_next(node)) {
+ if (use_bytes_index)
+ entry = rb_entry(node, struct btrfs_free_space,
+ bytes_index);
+ else
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
- for (node = &entry->offset_index; node; node = rb_next(node)) {
- entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ /*
+ * If we are using the bytes index then all subsequent entries
+ * in this tree are going to be < bytes, so simply set the max
+ * extent size and exit the loop.
+ *
+ * If we're using the offset index then we need to keep going
+ * through the rest of the tree.
+ */
if (entry->bytes < *bytes) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
+ if (use_bytes_index)
+ break;
continue;
}
@@ -1890,6 +2048,13 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
tmp = entry->offset;
}
+ /*
+ * We don't break here if we're using the bytes index because we
+ * may have another entry that has the correct alignment that is
+ * the right size, so we don't want to miss that possibility.
+ * At worst this adds another loop through the logic, but if we
+ * broke here we could prematurely ENOSPC.
+ */
if (entry->bytes < *bytes + align_off) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
@@ -1897,6 +2062,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bitmap) {
+ struct rb_node *old_next = rb_next(node);
u64 size = *bytes;
ret = search_bitmap(ctl, entry, &tmp, &size, true);
@@ -1909,6 +2075,15 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
max(get_max_extent_size(entry),
*max_extent_size);
}
+
+ /*
+ * The bitmap may have gotten re-arranged in the space
+ * index here because the max_extent_size may have been
+ * updated. Start from the beginning again if this
+ * happened.
+ */
+ if (use_bytes_index && old_next != rb_next(node))
+ goto again;
continue;
}
@@ -1920,29 +2095,6 @@ out:
return NULL;
}
-static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *bitmap_info)
-{
- struct btrfs_block_group *block_group = ctl->private;
- u64 bytes = bitmap_info->bytes;
- unsigned int rs, re;
- int count = 0;
-
- if (!block_group || !bytes)
- return count;
-
- bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
- BITS_PER_BITMAP) {
- bytes -= (rs - re) * ctl->unit;
- count++;
-
- if (!bytes)
- break;
- }
-
- return count;
-}
-
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
@@ -1952,8 +2104,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
-
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
}
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -1971,11 +2122,11 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
}
- unlink_free_space(ctl, bitmap_info);
+ unlink_free_space(ctl, bitmap_info, true);
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
}
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -2009,7 +2160,7 @@ again:
/* Cannot clear past the end of the bitmap */
search_bytes = min(search_bytes, end - search_start + 1);
- bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+ bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
*offset += search_bytes;
*bytes -= search_bytes;
@@ -2081,12 +2232,6 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
- /*
- * We set some bytes, we have no idea what the max extent size is
- * anymore.
- */
- info->max_extent_size = 0;
-
return bytes_to_set;
}
@@ -2094,7 +2239,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
@@ -2142,7 +2287,6 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
}
static const struct btrfs_free_space_op free_space_op = {
- .recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2164,7 +2308,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
return 0;
if (ctl->op == &free_space_op)
- block_group = ctl->private;
+ block_group = ctl->block_group;
again:
/*
* Since we link bitmaps right into the cluster we need to see if we
@@ -2287,7 +2431,7 @@ out:
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
@@ -2303,16 +2447,13 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
- else
+ else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
/* See try_merge_free_space() comment. */
if (right_info && !right_info->bitmap &&
(!is_trimmed || btrfs_free_space_trimmed(right_info))) {
- if (update_stat)
- unlink_free_space(ctl, right_info);
- else
- __unlink_free_space(ctl, right_info);
+ unlink_free_space(ctl, right_info, update_stat);
info->bytes += right_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, right_info);
merged = true;
@@ -2322,10 +2463,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (left_info && !left_info->bitmap &&
left_info->offset + left_info->bytes == offset &&
(!is_trimmed || btrfs_free_space_trimmed(left_info))) {
- if (update_stat)
- unlink_free_space(ctl, left_info);
- else
- __unlink_free_space(ctl, left_info);
+ unlink_free_space(ctl, left_info, update_stat);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, left_info);
@@ -2361,10 +2499,7 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
- if (update_stat)
- bitmap_clear_bits(ctl, bitmap, end, bytes);
- else
- __bitmap_clear_bits(ctl, bitmap, end, bytes);
+ bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
if (!bitmap->bytes)
free_bitmap(ctl, bitmap);
@@ -2418,10 +2553,7 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
- if (update_stat)
- bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
- else
- __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
+ bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
if (!bitmap->bytes)
free_bitmap(ctl, bitmap);
@@ -2465,16 +2597,18 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
}
}
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_free_space_ctl *ctl,
+int __btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
- struct btrfs_block_group *block_group = ctl->private;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
int ret = 0;
u64 filter_bytes = bytes;
+ ASSERT(!btrfs_is_zoned(fs_info));
+
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
@@ -2483,6 +2617,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
info->bytes = bytes;
info->trim_state = trim_state;
RB_CLEAR_NODE(&info->offset_index);
+ RB_CLEAR_NODE(&info->bytes_index);
spin_lock(&ctl->tree_lock);
@@ -2516,7 +2651,7 @@ link:
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
out:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (ret) {
@@ -2532,17 +2667,87 @@ out:
return ret;
}
+static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size, bool used)
+{
+ struct btrfs_space_info *sinfo = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 offset = bytenr - block_group->start;
+ u64 to_free, to_unusable;
+ int bg_reclaim_threshold = 0;
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
+
+ if (!initial)
+ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+
+ spin_lock(&ctl->tree_lock);
+ if (!used)
+ to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
+ else if (offset >= block_group->alloc_offset)
+ to_free = size;
+ else if (offset + size <= block_group->alloc_offset)
+ to_free = 0;
+ else
+ to_free = offset + size - block_group->alloc_offset;
+ to_unusable = size - to_free;
+
+ ctl->free_space += to_free;
+ /*
+ * If the block group is read-only, we should account freed space into
+ * bytes_readonly.
+ */
+ if (!block_group->ro)
+ block_group->zone_unusable += to_unusable;
+ spin_unlock(&ctl->tree_lock);
+ if (!used) {
+ spin_lock(&block_group->lock);
+ block_group->alloc_offset -= size;
+ spin_unlock(&block_group->lock);
+ }
+
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
+ /* All the region is now unusable. Mark it as unused and reclaim */
+ if (block_group->zone_unusable == block_group->length) {
+ btrfs_mark_bg_unused(block_group);
+ } else if (bg_reclaim_threshold &&
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
+ btrfs_mark_bg_to_reclaim(block_group);
+ }
+
+ return 0;
+}
+
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size, trim_state);
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
+}
+
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ false);
+
+ return btrfs_add_free_space(block_group, bytenr, size);
}
/*
@@ -2555,13 +2760,15 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
- return __btrfs_add_free_space(block_group->fs_info,
- block_group->free_space_ctl,
- bytenr, size, trim_state);
+ return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
@@ -2572,6 +2779,26 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
int ret;
bool re_search = false;
+ if (btrfs_is_zoned(block_group->fs_info)) {
+ /*
+ * This can happen with conventional zones when replaying log.
+ * Since the allocation info of tree-log nodes are not recorded
+ * to the extent-tree, calculate_alloc_pointer() failed to
+ * advance the allocation pointer after last allocated tree log
+ * node blocks.
+ *
+ * This function is called from
+ * btrfs_pin_extent_for_log_replay() when replaying the log.
+ * Advance the pointer not to overwrite the tree-log nodes.
+ */
+ if (block_group->start + block_group->alloc_offset <
+ offset + bytes) {
+ block_group->alloc_offset =
+ offset + bytes - block_group->start;
+ }
+ return 0;
+ }
+
spin_lock(&ctl->tree_lock);
again:
@@ -2600,7 +2827,7 @@ again:
re_search = false;
if (!info->bitmap) {
- unlink_free_space(ctl, info);
+ unlink_free_space(ctl, info, true);
if (offset == info->offset) {
u64 to_free = min(bytes, info->bytes);
@@ -2636,7 +2863,7 @@ again:
}
spin_unlock(&ctl->tree_lock);
- ret = __btrfs_add_free_space(block_group->fs_info, ctl,
+ ret = __btrfs_add_free_space(block_group,
offset + bytes,
old_end - (offset + bytes),
info->trim_state);
@@ -2651,7 +2878,7 @@ again:
goto again;
}
out_lock:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
out:
return ret;
@@ -2666,6 +2893,18 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
struct rb_node *n;
int count = 0;
+ /*
+ * Zoned btrfs does not use free space tree and cluster. Just print
+ * out the free space after the allocation offset.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags));
+ return;
+ }
+
spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
@@ -2682,16 +2921,17 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
"%d blocks of free space at or bigger than bytes is", count);
}
-void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
ctl->start = block_group->start;
- ctl->private = block_group;
+ ctl->block_group = block_group;
ctl->op = &free_space_op;
+ ctl->free_space_bytes = RB_ROOT_CACHED;
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
@@ -2709,8 +2949,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
-static int
-__btrfs_return_cluster_to_free_space(
+static void __btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
@@ -2719,8 +2958,10 @@ __btrfs_return_cluster_to_free_space(
struct rb_node *node;
spin_lock(&cluster->lock);
- if (cluster->block_group != block_group)
- goto out;
+ if (cluster->block_group != block_group) {
+ spin_unlock(&cluster->lock);
+ return;
+ }
cluster->block_group = NULL;
cluster->window_start = 0;
@@ -2756,41 +2997,12 @@ __btrfs_return_cluster_to_free_space(
}
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
+ rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
+ entry_less);
}
cluster->root = RB_ROOT;
-
-out:
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
- return 0;
-}
-
-static void __btrfs_remove_free_space_cache_locked(
- struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *node;
-
- while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, offset_index);
- if (!info->bitmap) {
- unlink_free_space(ctl, info);
- kmem_cache_free(btrfs_free_space_cachep, info);
- } else {
- free_bitmap(ctl, info);
- }
-
- cond_resched_lock(&ctl->tree_lock);
- }
-}
-
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
-{
- spin_lock(&ctl->tree_lock);
- __btrfs_remove_free_space_cache_locked(ctl);
- if (ctl->private)
- btrfs_discard_update_discardable(ctl->private, ctl);
- spin_unlock(&ctl->tree_lock);
}
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
@@ -2810,8 +3022,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
- __btrfs_remove_free_space_cache_locked(ctl);
- btrfs_discard_update_discardable(block_group, ctl);
+ __btrfs_remove_free_space_cache(ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
}
@@ -2860,16 +3072,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 align_gap = 0;
u64 align_gap_len = 0;
enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ bool use_bytes_index = (offset == block_group->start);
+
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
- block_group->full_stripe_len, max_extent_size);
+ block_group->full_stripe_len, max_extent_size,
+ use_bytes_index);
if (!entry)
goto out;
ret = offset;
if (entry->bitmap) {
- bitmap_clear_bits(ctl, entry, offset, bytes);
+ bitmap_clear_bits(ctl, entry, offset, bytes, true);
if (!btrfs_free_space_trimmed(entry))
atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
@@ -2877,7 +3093,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
align_gap_len = offset - entry->offset;
align_gap = entry->offset;
align_gap_trim_state = entry->trim_state;
@@ -2895,12 +3111,11 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
link_free_space(ctl, entry);
}
out:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
- __btrfs_add_free_space(block_group->fs_info, ctl,
- align_gap, align_gap_len,
+ __btrfs_add_free_space(block_group, align_gap, align_gap_len,
align_gap_trim_state);
return ret;
}
@@ -2913,12 +3128,11 @@ out:
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
- int ret;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
@@ -2926,28 +3140,27 @@ int btrfs_return_cluster_to_free_space(
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
- return 0;
+ return;
}
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
spin_unlock(&cluster->lock);
ctl = block_group->free_space_ctl;
/* now return any extents the cluster had on it */
spin_lock(&ctl->tree_lock);
- ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+ __btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
/* finally drop our ref */
btrfs_put_block_group(block_group);
- return ret;
}
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
@@ -2973,7 +3186,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
}
ret = search_start;
- __bitmap_clear_bits(ctl, entry, ret, bytes);
+ bitmap_clear_bits(ctl, entry, ret, bytes, false);
return ret;
}
@@ -2994,6 +3207,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct rb_node *node;
u64 ret = 0;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -3042,8 +3257,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
entry->bytes -= bytes;
}
- if (entry->bytes == 0)
- rb_erase(&entry->offset_index, &cluster->root);
break;
}
out:
@@ -3060,19 +3273,23 @@ out:
ctl->free_space -= bytes;
if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+
+ spin_lock(&cluster->lock);
if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
ctl->free_extents--;
if (entry->bitmap) {
kmem_cache_free(btrfs_free_space_bitmap_cachep,
entry->bitmap);
ctl->total_bitmaps--;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
} else if (!btrfs_free_space_trimmed(entry)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
}
kmem_cache_free(btrfs_free_space_cachep, entry);
}
+ spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;
@@ -3145,6 +3362,17 @@ again:
cluster->window_start = start * ctl->unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
+
+ /*
+ * We need to know if we're currently on the normal space index when we
+ * manipulate the bitmap so that we know we need to remove and re-insert
+ * it into the space_index tree. Clear the bytes_index node here so the
+ * bitmap manipulation helpers know not to mess with the space_index
+ * until this bitmap entry is added back into the normal cache.
+ */
+ RB_CLEAR_NODE(&entry->bytes_index);
+
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
ASSERT(!ret); /* -EEXIST; Logic error */
@@ -3235,6 +3463,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
total_size += entry->bytes;
@@ -3320,7 +3549,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
* data, keep it dense.
*/
if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
- cont1_bytes = min_bytes = bytes + empty_size;
+ cont1_bytes = bytes + empty_size;
+ min_bytes = cont1_bytes;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
min_bytes = fs_info->sectorsize;
@@ -3364,7 +3594,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
list_del_init(&entry->list);
if (!ret) {
- atomic_inc(&block_group->count);
+ btrfs_get_block_group(block_group);
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
@@ -3426,13 +3656,13 @@ static int do_trimming(struct btrfs_block_group *block_group,
mutex_lock(&ctl->cache_writeout_mutex);
if (reserved_start < start)
- __btrfs_add_free_space(fs_info, ctl, reserved_start,
+ __btrfs_add_free_space(block_group, reserved_start,
start - reserved_start,
reserved_trim_state);
if (start + bytes < reserved_start + reserved_bytes)
- __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
+ __btrfs_add_free_space(block_group, end, reserved_end - end,
reserved_trim_state);
- __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
+ __btrfs_add_free_space(block_group, start, bytes, trim_state);
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3506,7 +3736,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
/*
* Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
* If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
@@ -3532,7 +3762,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
goto next;
}
- unlink_free_space(ctl, entry);
+ unlink_free_space(ctl, entry, true);
kmem_cache_free(btrfs_free_space_cachep, entry);
}
@@ -3719,7 +3949,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
bytes > (max_discard_size + minlen))
bytes = max_discard_size;
- bitmap_clear_bits(ctl, entry, start, bytes);
+ bitmap_clear_bits(ctl, entry, start, bytes, true);
if (entry->bytes == 0)
free_bitmap(ctl, entry);
@@ -3763,46 +3993,6 @@ out:
return ret;
}
-void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
-{
- atomic_inc(&cache->trimming);
-}
-
-void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
-{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- bool cleanup;
-
- spin_lock(&block_group->lock);
- cleanup = (atomic_dec_and_test(&block_group->trimming) &&
- block_group->removed);
- spin_unlock(&block_group->lock);
-
- if (cleanup) {
- mutex_lock(&fs_info->chunk_mutex);
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- mutex_unlock(&fs_info->chunk_mutex);
-
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
-
- /*
- * We've left one free space entry and other tasks trimming
- * this block group have left 1 entry each one. Free them.
- */
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
- }
-}
-
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
@@ -3810,14 +4000,16 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
int ret;
u64 rem = 0;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
- btrfs_get_block_group_trimming(block_group);
+ btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
@@ -3830,7 +4022,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
if (rem)
reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
return ret;
}
@@ -3843,15 +4035,15 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
- btrfs_get_block_group_trimming(block_group);
+ btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
return ret;
}
@@ -3865,183 +4057,77 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
- btrfs_get_block_group_trimming(block_group);
+ btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
async);
- btrfs_put_block_group_trimming(block_group);
+ btrfs_unfreeze_block_group(block_group);
return ret;
}
-/*
- * Find the left-most item in the cache tree, and then return the
- * smallest inode number in the item.
- *
- * Note: the returned inode number may not be the smallest one in
- * the tree, if the left-most item is a bitmap.
- */
-u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
-{
- struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
- struct btrfs_free_space *entry = NULL;
- u64 ino = 0;
-
- spin_lock(&ctl->tree_lock);
-
- if (RB_EMPTY_ROOT(&ctl->free_space_offset))
- goto out;
-
- entry = rb_entry(rb_first(&ctl->free_space_offset),
- struct btrfs_free_space, offset_index);
-
- if (!entry->bitmap) {
- ino = entry->offset;
-
- unlink_free_space(ctl, entry);
- entry->offset++;
- entry->bytes--;
- if (!entry->bytes)
- kmem_cache_free(btrfs_free_space_cachep, entry);
- else
- link_free_space(ctl, entry);
- } else {
- u64 offset = 0;
- u64 count = 1;
- int ret;
-
- ret = search_bitmap(ctl, entry, &offset, &count, true);
- /* Logic error; Should be empty if it can't find anything */
- ASSERT(!ret);
-
- ino = offset;
- bitmap_clear_bits(ctl, entry, offset, 1);
- if (entry->bytes == 0)
- free_bitmap(ctl, entry);
- }
-out:
- spin_unlock(&ctl->tree_lock);
-
- return ino;
-}
-
-struct inode *lookup_free_ino_inode(struct btrfs_root *root,
- struct btrfs_path *path)
-{
- struct inode *inode = NULL;
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_inode)
- inode = igrab(root->ino_cache_inode);
- spin_unlock(&root->ino_cache_lock);
- if (inode)
- return inode;
-
- inode = __lookup_free_space_inode(root, path, 0);
- if (IS_ERR(inode))
- return inode;
-
- spin_lock(&root->ino_cache_lock);
- if (!btrfs_fs_closing(root->fs_info))
- root->ino_cache_inode = igrab(inode);
- spin_unlock(&root->ino_cache_lock);
-
- return inode;
-}
-
-int create_free_ino_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path)
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
- return __create_free_space_inode(root, trans, path,
- BTRFS_FREE_INO_OBJECTID, 0);
+ return btrfs_super_cache_generation(fs_info->super_copy);
}
-int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_path *path;
- struct inode *inode;
+ struct btrfs_block_group *block_group;
+ struct rb_node *node;
int ret = 0;
- u64 root_gen = btrfs_root_generation(&root->root_item);
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
- /*
- * If we're unmounting then just return, since this does a search on the
- * normal root and not the commit root and we could deadlock.
- */
- if (btrfs_fs_closing(fs_info))
- return 0;
+ btrfs_info(fs_info, "cleaning free space cache v1");
- path = btrfs_alloc_path();
- if (!path)
- return 0;
-
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode))
- goto out;
-
- if (root_gen != BTRFS_I(inode)->generation)
- goto out_put;
-
- ret = __load_free_space_cache(root, inode, ctl, path, 0);
-
- if (ret < 0)
- btrfs_err(fs_info,
- "failed to load free ino cache for root %llu",
- root->root_key.objectid);
-out_put:
- iput(inode);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group, cache_node);
+ ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
+ if (ret)
+ goto out;
+ node = rb_next(node);
+ }
out:
- btrfs_free_path(path);
return ret;
}
-int btrfs_write_out_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode)
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_trans_handle *trans;
int ret;
- struct btrfs_io_ctl io_ctl;
- bool release_metadata = true;
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
+ /*
+ * update_super_roots will appropriately set or unset
+ * super_copy->cache_generation based on SPACE_CACHE and
+ * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
+ * transaction commit whether we are enabling space cache v1 and don't
+ * have any other work to do, or are disabling it and removing free
+ * space inodes.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- memset(&io_ctl, 0, sizeof(io_ctl));
- ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
- if (!ret) {
- /*
- * At this point writepages() didn't error out, so our metadata
- * reservation is released when the writeback finishes, at
- * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
- * with or without an error.
- */
- release_metadata = false;
- ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
+ if (!active) {
+ set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
+ ret = cleanup_free_space_cache_v1(fs_info, trans);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
}
- if (ret) {
- if (release_metadata)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- inode->i_size, true);
-#ifdef DEBUG
- btrfs_err(fs_info,
- "failed to write free ino cache for root %llu",
- root->root_key.objectid);
-#endif
- }
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 2e0a8077aa74..6d419ba53e95 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -22,6 +22,7 @@ enum btrfs_trim_state {
struct btrfs_free_space {
struct rb_node offset_index;
+ struct rb_node bytes_index;
u64 offset;
u64 bytes;
u64 max_extent_size;
@@ -45,6 +46,7 @@ static inline bool btrfs_free_space_trimming_bitmap(
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
+ struct rb_root_cached free_space_bytes;
u64 free_space;
int extents_thresh;
int free_extents;
@@ -54,13 +56,12 @@ struct btrfs_free_space_ctl {
s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
const struct btrfs_free_space_op *op;
- void *private;
+ struct btrfs_block_group *block_group;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
};
struct btrfs_free_space_op {
- void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
};
@@ -76,7 +77,6 @@ struct btrfs_io_ctl {
int num_pages;
int entries;
int bitmaps;
- unsigned check_crcs:1;
};
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -84,6 +84,9 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
int create_free_space_inode(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
@@ -97,36 +100,24 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
-struct inode *lookup_free_ino_inode(struct btrfs_root *root,
- struct btrfs_path *path);
-int create_free_ino_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path);
-int load_free_ino_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_root *root);
-int btrfs_write_out_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode);
-void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
-int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_free_space_ctl *ctl,
- u64 bytenr, u64 size,
- enum btrfs_trim_state trim_state);
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl);
+int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr,
+ u64 size, enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
-u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes);
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
@@ -136,7 +127,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
@@ -148,6 +139,8 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async);
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int test_add_free_space_entry(struct btrfs_block_group *cache,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 258cb3fae17a..367bcfcf68f5 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -16,12 +16,30 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
+static struct btrfs_root *btrfs_free_space_root(
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+
+ if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
+ key.offset = block_group->global_root_id;
+ return btrfs_global_root(block_group->fs_info, &key);
+}
+
void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
@@ -47,7 +65,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -81,7 +99,7 @@ struct btrfs_free_space_info *search_free_space_info(
struct btrfs_path *path, int cow)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
int ret;
@@ -132,9 +150,10 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
return 0;
}
-static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info,
+ u64 size)
{
- return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
+ return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE);
}
static unsigned long *alloc_bitmap(u32 bitmap_size)
@@ -183,7 +202,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -196,8 +215,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->length,
- fs_info->sectorsize);
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -286,8 +304,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
u32 data_size;
extent_size = min(end - i, bitmap_range);
- data_size = free_space_bitmap_size(extent_size,
- fs_info->sectorsize);
+ data_size = free_space_bitmap_size(fs_info, extent_size);
key.objectid = i;
key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
@@ -323,7 +340,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -335,8 +352,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->length,
- fs_info->sectorsize);
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -379,8 +395,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
fs_info->sectorsize *
BITS_PER_BYTE);
bitmap_cursor = ((char *)bitmap) + bitmap_pos;
- data_size = free_space_bitmap_size(found_key.offset,
- fs_info->sectorsize);
+ data_size = free_space_bitmap_size(fs_info,
+ found_key.offset);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
read_extent_buffer(leaf, bitmap_cursor, ptr,
@@ -412,7 +428,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize);
+ nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
@@ -536,8 +552,8 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
end = found_end;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- first = div_u64(*start - found_start, fs_info->sectorsize);
- last = div_u64(end - found_start, fs_info->sectorsize);
+ first = (*start - found_start) >> fs_info->sectorsize_bits;
+ last = (end - found_start) >> fs_info->sectorsize_bits;
if (bit)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
@@ -584,7 +600,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
- struct btrfs_root *root = block_group->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
@@ -697,7 +713,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 found_start, found_end;
u64 end = start + size;
@@ -849,7 +865,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 start, u64 size)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key, new_key;
u64 found_start, found_end;
u64 end = start + size;
@@ -1044,7 +1060,7 @@ out:
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
- struct btrfs_root *extent_root = trans->fs_info->extent_root;
+ struct btrfs_root *extent_root;
struct btrfs_path *path, *path2;
struct btrfs_key key;
u64 start, end;
@@ -1078,6 +1094,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
+ extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
@@ -1148,15 +1165,20 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
free_space_root = btrfs_create_tree(trans,
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
ret = PTR_ERR(free_space_root);
goto abort;
}
- fs_info->free_space_root = free_space_root;
+ ret = btrfs_global_root_insert(free_space_root);
+ if (ret) {
+ btrfs_put_root(free_space_root);
+ goto abort;
+ }
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
@@ -1169,11 +1191,18 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ ret = btrfs_commit_transaction(trans);
- return btrfs_commit_transaction(trans);
+ /*
+ * Now that we've committed the transaction any reading of our commit
+ * root will be safe, so we can cache from the free space tree now.
+ */
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ return ret;
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
@@ -1191,8 +1220,6 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
key.objectid = 0;
key.type = 0;
key.offset = 0;
@@ -1224,7 +1251,12 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_root *free_space_root = fs_info->free_space_root;
+ struct btrfs_key key = {
+ .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
+ .type = BTRFS_ROOT_ITEM_KEY,
+ .offset = 0,
+ };
+ struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
int ret;
trans = btrfs_start_transaction(tree_root, 0);
@@ -1233,7 +1265,6 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
- fs_info->free_space_root = NULL;
ret = clear_free_space_tree(trans, free_space_root);
if (ret)
@@ -1243,17 +1274,16 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto abort;
+ btrfs_global_root_delete(free_space_root);
list_del(&free_space_root->dirty_list);
btrfs_tree_lock(free_space_root->node);
btrfs_clean_tree_block(free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
- btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
- 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
- free_extent_buffer(free_space_root->node);
- free_extent_buffer(free_space_root->commit_root);
- kfree(free_space_root);
+ btrfs_put_root(free_space_root);
return btrfs_commit_transaction(trans);
@@ -1313,7 +1343,7 @@ out:
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
- struct btrfs_root *root = trans->fs_info->free_space_root;
+ struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_path *path;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
@@ -1404,7 +1434,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- root = fs_info->free_space_root;
+ root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
@@ -1423,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
offset = key.objectid;
while (offset < key.objectid + key.offset) {
bit = free_space_test_bit(block_group, path, offset);
@@ -1460,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
@@ -1482,7 +1508,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- root = fs_info->free_space_root;
+ root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
@@ -1501,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1522,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 668701832845..0eeb5ea87894 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,6 +4,7 @@
*/
#include "ctree.h"
+#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
@@ -19,7 +20,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
u32 cur_offset = 0;
int len;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
@@ -45,7 +46,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
u32 cur_offset = 0;
int ref_name_len;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
/*
@@ -119,8 +120,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
ret = -ENOENT;
@@ -141,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (index)
*index = btrfs_inode_extref_index(leaf, extref);
@@ -193,8 +192,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
@@ -212,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
if (index)
*index = btrfs_inode_ref_index(leaf, ref);
@@ -260,7 +257,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_item *item;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
@@ -270,7 +266,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -287,9 +282,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
goto out;
leaf = path->nodes[0];
- item = btrfs_item_nr(path->slots[0]);
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
- ptr += btrfs_item_size(leaf, item) - ins_len;
+ ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
@@ -327,7 +321,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
@@ -338,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ref)
goto out;
- old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+ old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
btrfs_extend_item(path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
@@ -425,3 +418,332 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
}
return ret;
}
+
+static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
+ struct extent_buffer *leaf,
+ struct btrfs_file_extent_item *fi,
+ u64 offset, int extent_type, int slot)
+{
+ if (!inode)
+ return;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot,
+ offset);
+ else
+ trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
+}
+
+/*
+ * Remove inode items from a given root.
+ *
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @control: The btrfs_truncate_control to control how and what we
+ * are truncating.
+ *
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equals to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equals to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
+ * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
+ */
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 new_size = control->new_size;
+ u64 extent_num_bytes = 0;
+ u64 extent_offset = 0;
+ u64 item_end = 0;
+ u32 found_type = (u8)-1;
+ int del_item;
+ int pending_del_nr = 0;
+ int pending_del_slot = 0;
+ int extent_type = -1;
+ int ret;
+ u64 bytes_deleted = 0;
+ bool be_nice = false;
+
+ ASSERT(control->inode || !control->clear_extent_range);
+ ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);
+
+ control->last_size = new_size;
+ control->sub_bytes = 0;
+
+ /*
+ * For shareable roots we want to back off from time to time, this turns
+ * out to be subvolume roots, reloc roots, and data reloc roots.
+ */
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ be_nice = true;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_BACK;
+
+ key.objectid = control->ino;
+ key.offset = (u64)-1;
+ key.type = (u8)-1;
+
+search_again:
+ /*
+ * With a 16K leaf size and 128MiB extents, you can actually queue up a
+ * huge file in a single leaf. Most of the time that bytes_deleted is
+ * > 0, it will be huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > SZ_32M &&
+ btrfs_should_end_transaction(trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ if (ret > 0) {
+ ret = 0;
+ /* There are no items in the tree for us to truncate, we're done */
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (1) {
+ u64 clear_start = 0, clear_len = 0, extent_start = 0;
+ bool should_throttle = false;
+
+ fi = NULL;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ found_type = found_key.type;
+
+ if (found_key.objectid != control->ino)
+ break;
+
+ if (found_type < control->min_type)
+ break;
+
+ item_end = found_key.offset;
+ if (found_type == BTRFS_EXTENT_DATA_KEY) {
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE)
+ item_end +=
+ btrfs_file_extent_num_bytes(leaf, fi);
+ else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ item_end += btrfs_file_extent_ram_bytes(leaf, fi);
+
+ btrfs_trace_truncate(control->inode, leaf, fi,
+ found_key.offset, extent_type,
+ path->slots[0]);
+ item_end--;
+ }
+ if (found_type > control->min_type) {
+ del_item = 1;
+ } else {
+ if (item_end < new_size)
+ break;
+ if (found_key.offset >= new_size)
+ del_item = 1;
+ else
+ del_item = 0;
+ }
+
+ /* FIXME, shrink the extent if the ref count is only 1 */
+ if (found_type != BTRFS_EXTENT_DATA_KEY)
+ goto delete;
+
+ control->extents_found++;
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ u64 num_dec;
+
+ clear_start = found_key.offset;
+ extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (!del_item) {
+ u64 orig_num_bytes =
+ btrfs_file_extent_num_bytes(leaf, fi);
+ extent_num_bytes = ALIGN(new_size -
+ found_key.offset,
+ fs_info->sectorsize);
+ clear_start = ALIGN(new_size, fs_info->sectorsize);
+
+ btrfs_set_file_extent_num_bytes(leaf, fi,
+ extent_num_bytes);
+ num_dec = (orig_num_bytes - extent_num_bytes);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
+ extent_num_bytes =
+ btrfs_file_extent_disk_num_bytes(leaf, fi);
+ extent_offset = found_key.offset -
+ btrfs_file_extent_offset(leaf, fi);
+
+ /* FIXME blocksize != 4096 */
+ num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_start != 0)
+ control->sub_bytes += num_dec;
+ }
+ clear_len = num_dec;
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ /*
+ * We can't truncate inline items that have had
+ * special encodings
+ */
+ if (!del_item &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+ btrfs_file_extent_compression(leaf, fi) == 0) {
+ u32 size = (u32)(new_size - found_key.offset);
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(path, size, 1);
+ } else if (!del_item) {
+ /*
+ * We have to bail so the last_size is set to
+ * just before this extent.
+ */
+ ret = BTRFS_NEED_TRUNCATE_BLOCK;
+ break;
+ } else {
+ /*
+ * Inline extents are special, we just treat
+ * them as a full sector worth in the file
+ * extent tree just for simplicity sake.
+ */
+ clear_len = fs_info->sectorsize;
+ }
+
+ control->sub_bytes += item_end + 1 - new_size;
+ }
+delete:
+ /*
+ * We only want to clear the file extent range if we're
+ * modifying the actual inode's mapping, which is just the
+ * normal truncate path.
+ */
+ if (control->clear_extent_range) {
+ ret = btrfs_inode_clear_file_extent_range(control->inode,
+ clear_start, clear_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (del_item) {
+ ASSERT(!pending_del_nr ||
+ ((path->slots[0] + 1) == pending_del_slot));
+
+ control->last_size = found_key.offset;
+ if (!pending_del_nr) {
+ /* No pending yet, add ourselves */
+ pending_del_slot = path->slots[0];
+ pending_del_nr = 1;
+ } else if (pending_del_nr &&
+ path->slots[0] + 1 == pending_del_slot) {
+ /* Hop on the pending chunk */
+ pending_del_nr++;
+ pending_del_slot = path->slots[0];
+ }
+ } else {
+ control->last_size = new_size;
+ break;
+ }
+
+ if (del_item && extent_start != 0 && !control->skip_ref_updates) {
+ struct btrfs_ref ref = { 0 };
+
+ bytes_deleted += extent_num_bytes;
+
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+ extent_start, extent_num_bytes, 0);
+ btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+ control->ino, extent_offset,
+ root->root_key.objectid, false);
+ ret = btrfs_free_extent(trans, &ref);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ if (be_nice) {
+ if (btrfs_should_throttle_delayed_refs(trans))
+ should_throttle = true;
+ }
+ }
+
+ if (found_type == BTRFS_INODE_ITEM_KEY)
+ break;
+
+ if (path->slots[0] == 0 ||
+ path->slots[0] != pending_del_slot ||
+ should_throttle) {
+ if (pending_del_nr) {
+ ret = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ pending_del_nr = 0;
+ }
+ btrfs_release_path(path);
+
+ /*
+ * We can generate a lot of delayed refs, so we need to
+ * throttle every once and a while and make sure we're
+ * adding enough space to keep up with the work we are
+ * generating. Since we hold a transaction here we
+ * can't flush, and we don't want to FLUSH_LIMIT because
+ * we could have generated too many delayed refs to
+ * actually allocate, so just bail if we're short and
+ * let the normal reservation dance happen higher up.
+ */
+ if (should_throttle) {
+ ret = btrfs_delayed_refs_rsv_refill(fs_info,
+ BTRFS_RESERVE_NO_FLUSH);
+ if (ret) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ goto search_again;
+ } else {
+ path->slots[0]--;
+ }
+ }
+out:
+ if (ret >= 0 && pending_del_nr) {
+ int err;
+
+ err = btrfs_del_items(trans, root, path, pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans, err);
+ ret = err;
+ }
+ }
+
+ ASSERT(control->last_size >= new_size);
+ if (!ret && control->last_size > new_size)
+ control->last_size = new_size;
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
new file mode 100644
index 000000000000..a8fc16d0147f
--- /dev/null
+++ b/fs/btrfs/inode-item.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_INODE_ITEM_H
+#define BTRFS_INODE_ITEM_H
+
+#include <linux/types.h>
+
+struct btrfs_trans_handle;
+struct btrfs_root;
+struct btrfs_path;
+struct btrfs_key;
+struct btrfs_inode_extref;
+struct btrfs_inode;
+struct extent_buffer;
+
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define BTRFS_NEED_TRUNCATE_BLOCK 1
+
+struct btrfs_truncate_control {
+ /*
+ * IN: the inode we're operating on, this can be NULL if
+ * ->clear_extent_range is false.
+ */
+ struct btrfs_inode *inode;
+
+ /* IN: the size we're truncating to. */
+ u64 new_size;
+
+ /* OUT: the number of extents truncated. */
+ u64 extents_found;
+
+ /* OUT: the last size we truncated this inode to. */
+ u64 last_size;
+
+ /* OUT: the number of bytes to sub from this inode. */
+ u64 sub_bytes;
+
+ /* IN: the ino we are truncating. */
+ u64 ino;
+
+ /*
+ * IN: minimum key type to remove. All key types with this type are
+ * removed only if their offset >= new_size.
+ */
+ u32 min_type;
+
+ /*
+ * IN: true if we don't want to do extent reference updates for any file
+ * extents we drop.
+ */
+ bool skip_ref_updates;
+
+ /*
+ * IN: true if we need to clear the file extent range for the inode as
+ * we drop the file extent items.
+ */
+ bool clear_extent_range;
+};
+
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_truncate_control *control);
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path,
+ struct btrfs_key *location, int mod);
+
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int ins_len,
+ int cow);
+
+struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+ int slot, const char *name,
+ int name_len);
+struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
+ struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const char *name, int name_len);
+
+#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
deleted file mode 100644
index d5c9c69d8263..000000000000
--- a/fs/btrfs/inode-map.c
+++ /dev/null
@@ -1,581 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- */
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-
-#include "ctree.h"
-#include "disk-io.h"
-#include "free-space-cache.h"
-#include "inode-map.h"
-#include "transaction.h"
-#include "delalloc-space.h"
-
-static void fail_caching_thread(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
- btrfs_warn(fs_info, "failed to start inode caching task");
- btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
- "disabling inode map caching");
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_ERROR;
- spin_unlock(&root->ino_cache_lock);
- wake_up(&root->ino_cache_wait);
-}
-
-static int caching_kthread(void *data)
-{
- struct btrfs_root *root = data;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- u64 last = (u64)-1;
- int slot;
- int ret;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path) {
- fail_caching_thread(root);
- return -ENOMEM;
- }
-
- /* Since the commit root is read-only, we can safely skip locking. */
- path->skip_locking = 1;
- path->search_commit_root = 1;
- path->reada = READA_FORWARD;
-
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
-again:
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- if (btrfs_fs_closing(fs_info))
- goto out;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
-
- if (need_resched() ||
- btrfs_transaction_in_commit(fs_info)) {
- leaf = path->nodes[0];
-
- if (WARN_ON(btrfs_header_nritems(leaf) == 0))
- break;
-
- /*
- * Save the key so we can advances forward
- * in the next search.
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(path);
- root->ino_cache_progress = last;
- up_read(&fs_info->commit_root_sem);
- schedule_timeout(1);
- goto again;
- } else
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- if (key.type != BTRFS_INODE_ITEM_KEY)
- goto next;
-
- if (key.objectid >= root->highest_objectid)
- break;
-
- if (last != (u64)-1 && last + 1 != key.objectid) {
- __btrfs_add_free_space(fs_info, ctl, last + 1,
- key.objectid - last - 1, 0);
- wake_up(&root->ino_cache_wait);
- }
-
- last = key.objectid;
-next:
- path->slots[0]++;
- }
-
- if (last < root->highest_objectid - 1) {
- __btrfs_add_free_space(fs_info, ctl, last + 1,
- root->highest_objectid - last - 1, 0);
- }
-
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->ino_cache_lock);
-
- root->ino_cache_progress = (u64)-1;
- btrfs_unpin_free_ino(root);
-out:
- wake_up(&root->ino_cache_wait);
- up_read(&fs_info->commit_root_sem);
-
- btrfs_free_path(path);
-
- return ret;
-}
-
-static void start_caching(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct task_struct *tsk;
- int ret;
- u64 objectid;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return;
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state != BTRFS_CACHE_NO) {
- spin_unlock(&root->ino_cache_lock);
- return;
- }
-
- root->ino_cache_state = BTRFS_CACHE_STARTED;
- spin_unlock(&root->ino_cache_lock);
-
- ret = load_free_ino_cache(fs_info, root);
- if (ret == 1) {
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->ino_cache_lock);
- wake_up(&root->ino_cache_wait);
- return;
- }
-
- /*
- * It can be quite time-consuming to fill the cache by searching
- * through the extent tree, and this can keep ino allocation path
- * waiting. Therefore at start we quickly find out the highest
- * inode number and we know we can use inode numbers which fall in
- * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
- */
- ret = btrfs_find_free_objectid(root, &objectid);
- if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
- __btrfs_add_free_space(fs_info, ctl, objectid,
- BTRFS_LAST_FREE_OBJECTID - objectid + 1,
- 0);
- wake_up(&root->ino_cache_wait);
- }
-
- tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
- root->root_key.objectid);
- if (IS_ERR(tsk))
- fail_caching_thread(root);
-}
-
-int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
-{
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
- return btrfs_find_free_objectid(root, objectid);
-
-again:
- *objectid = btrfs_find_ino_for_alloc(root);
-
- if (*objectid != 0)
- return 0;
-
- start_caching(root);
-
- wait_event(root->ino_cache_wait,
- root->ino_cache_state == BTRFS_CACHE_FINISHED ||
- root->ino_cache_state == BTRFS_CACHE_ERROR ||
- root->free_ino_ctl->free_space > 0);
-
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
- root->free_ino_ctl->free_space == 0)
- return -ENOSPC;
- else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
- return btrfs_find_free_objectid(root, objectid);
- else
- goto again;
-}
-
-void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return;
-again:
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
- } else {
- down_write(&fs_info->commit_root_sem);
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- spin_unlock(&root->ino_cache_lock);
- up_write(&fs_info->commit_root_sem);
- goto again;
- }
- spin_unlock(&root->ino_cache_lock);
-
- start_caching(root);
-
- __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
-
- up_write(&fs_info->commit_root_sem);
- }
-}
-
-/*
- * When a transaction is committed, we'll move those inode numbers which are
- * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
- * others will just be dropped, because the commit root we were searching has
- * changed.
- *
- * Must be called with root->fs_info->commit_root_sem held
- */
-void btrfs_unpin_free_ino(struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
- spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
- struct btrfs_free_space *info;
- struct rb_node *n;
- u64 count;
-
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
- return;
-
- while (1) {
- spin_lock(rbroot_lock);
- n = rb_first(rbroot);
- if (!n) {
- spin_unlock(rbroot_lock);
- break;
- }
-
- info = rb_entry(n, struct btrfs_free_space, offset_index);
- BUG_ON(info->bitmap); /* Logic error */
-
- if (info->offset > root->ino_cache_progress)
- count = 0;
- else
- count = min(root->ino_cache_progress - info->offset + 1,
- info->bytes);
-
- rb_erase(&info->offset_index, rbroot);
- spin_unlock(rbroot_lock);
- if (count)
- __btrfs_add_free_space(root->fs_info, ctl,
- info->offset, count, 0);
- kmem_cache_free(btrfs_free_space_cachep, info);
- }
-}
-
-#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
-#define INODES_PER_BITMAP (PAGE_SIZE * 8)
-
-/*
- * The goal is to keep the memory used by the free_ino tree won't
- * exceed the memory if we use bitmaps only.
- */
-static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *n;
- int max_ino;
- int max_bitmaps;
-
- n = rb_last(&ctl->free_space_offset);
- if (!n) {
- ctl->extents_thresh = INIT_THRESHOLD;
- return;
- }
- info = rb_entry(n, struct btrfs_free_space, offset_index);
-
- /*
- * Find the maximum inode number in the filesystem. Note we
- * ignore the fact that this can be a bitmap, because we are
- * not doing precise calculation.
- */
- max_ino = info->bytes - 1;
-
- max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
- if (max_bitmaps <= ctl->total_bitmaps) {
- ctl->extents_thresh = 0;
- return;
- }
-
- ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
- PAGE_SIZE / sizeof(*info);
-}
-
-/*
- * We don't fall back to bitmap, if we are below the extents threshold
- * or this chunk of inode numbers is a big one.
- */
-static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- if (ctl->free_extents < ctl->extents_thresh ||
- info->bytes > INODES_PER_BITMAP / 10)
- return false;
-
- return true;
-}
-
-static const struct btrfs_free_space_op free_ino_op = {
- .recalc_thresholds = recalculate_thresholds,
- .use_bitmap = use_bitmap,
-};
-
-static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
-{
-}
-
-static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- /*
- * We always use extents for two reasons:
- *
- * - The pinned tree is only used during the process of caching
- * work.
- * - Make code simpler. See btrfs_unpin_free_ino().
- */
- return false;
-}
-
-static const struct btrfs_free_space_op pinned_free_ino_op = {
- .recalc_thresholds = pinned_recalc_thresholds,
- .use_bitmap = pinned_use_bitmap,
-};
-
-void btrfs_init_free_ino_ctl(struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
-
- spin_lock_init(&ctl->tree_lock);
- ctl->unit = 1;
- ctl->start = 0;
- ctl->private = NULL;
- ctl->op = &free_ino_op;
- INIT_LIST_HEAD(&ctl->trimming_ranges);
- mutex_init(&ctl->cache_writeout_mutex);
-
- /*
- * Initially we allow to use 16K of ram to cache chunks of
- * inode numbers before we resort to bitmaps. This is somewhat
- * arbitrary, but it will be adjusted in runtime.
- */
- ctl->extents_thresh = INIT_THRESHOLD;
-
- spin_lock_init(&pinned->tree_lock);
- pinned->unit = 1;
- pinned->start = 0;
- pinned->private = NULL;
- pinned->extents_thresh = 0;
- pinned->op = &pinned_free_ino_op;
-}
-
-int btrfs_save_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_path *path;
- struct inode *inode;
- struct btrfs_block_rsv *rsv;
- struct extent_changeset *data_reserved = NULL;
- u64 num_bytes;
- u64 alloc_hint = 0;
- int ret;
- int prealloc;
- bool retry = false;
-
- /* only fs tree and subvol/snap needs ino cache */
- if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
- (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
- root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
- return 0;
-
- /* Don't save inode cache if we are deleting this root */
- if (btrfs_root_refs(&root->root_item) == 0)
- return 0;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- rsv = trans->block_rsv;
- trans->block_rsv = &fs_info->trans_block_rsv;
-
- num_bytes = trans->bytes_reserved;
- /*
- * 1 item for inode item insertion if need
- * 4 items for inode item update (in the worst case)
- * 1 items for slack space if we need do truncation
- * 1 item for free space object
- * 3 items for pre-allocation
- */
- trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- trans->bytes_reserved,
- BTRFS_RESERVE_NO_FLUSH);
- if (ret)
- goto out;
- trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
- trans->bytes_reserved, 1);
-again:
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
- ret = PTR_ERR(inode);
- goto out_release;
- }
-
- if (IS_ERR(inode)) {
- BUG_ON(retry); /* Logic error */
- retry = true;
-
- ret = create_free_ino_inode(root, trans, path);
- if (ret)
- goto out_release;
- goto again;
- }
-
- BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
-
- if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
- if (ret) {
- if (ret != -ENOSPC)
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
- }
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
- ret = -1;
- spin_unlock(&root->ino_cache_lock);
- goto out_put;
- }
- spin_unlock(&root->ino_cache_lock);
-
- spin_lock(&ctl->tree_lock);
- prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
- prealloc = ALIGN(prealloc, PAGE_SIZE);
- prealloc += ctl->total_bitmaps * PAGE_SIZE;
- spin_unlock(&ctl->tree_lock);
-
- /* Just to make sure we have enough space */
- prealloc += 8 * PAGE_SIZE;
-
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
- if (ret)
- goto out_put;
-
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
- prealloc, prealloc, &alloc_hint);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
- goto out_put;
- }
-
- ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
-out_put:
- iput(inode);
-out_release:
- trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
- trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
-out:
- trans->block_rsv = rsv;
- trans->bytes_reserved = num_bytes;
-
- btrfs_free_path(path);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
-{
- struct btrfs_path *path;
- int ret;
- struct extent_buffer *l;
- struct btrfs_key search_key;
- struct btrfs_key found_key;
- int slot;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
- search_key.type = -1;
- search_key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto error;
- BUG_ON(ret == 0); /* Corruption */
- if (path->slots[0] > 0) {
- slot = path->slots[0] - 1;
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = max_t(u64, found_key.objectid,
- BTRFS_FIRST_FREE_OBJECTID - 1);
- } else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
- }
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
-}
-
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
-{
- int ret;
- mutex_lock(&root->objectid_mutex);
-
- if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
- btrfs_warn(root->fs_info,
- "the objectid of root %llu reaches its highest value",
- root->root_key.objectid);
- ret = -ENOSPC;
- goto out;
- }
-
- *objectid = ++root->highest_objectid;
- ret = 0;
-out:
- mutex_unlock(&root->objectid_mutex);
- return ret;
-}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
deleted file mode 100644
index 7a962811dffe..000000000000
--- a/fs/btrfs/inode-map.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef BTRFS_INODE_MAP_H
-#define BTRFS_INODE_MAP_H
-
-void btrfs_init_free_ino_ctl(struct btrfs_root *root);
-void btrfs_unpin_free_ino(struct btrfs_root *root);
-void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
-int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
-int btrfs_save_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans);
-
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
-
-#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d267eb5caa7b..0e516aefbf51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3,9 +3,10 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
+#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
+#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -28,8 +29,11 @@
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
+#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -43,22 +47,56 @@
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
-#include "inode-map.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "space-info.h"
+#include "zoned.h"
+#include "subpage.h"
+#include "inode-item.h"
struct btrfs_iget_args {
- struct btrfs_key *location;
+ u64 ino;
struct btrfs_root *root;
};
struct btrfs_dio_data {
- u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool data_space_reserved;
+ bool nocow_done;
+};
+
+struct btrfs_dio_private {
+ struct inode *inode;
+
+ /*
+ * Since DIO can use anonymous page, we cannot use page_offset() to
+ * grab the file offset, thus need a dedicated member for file offset.
+ */
+ u64 file_offset;
+ /* Used for bio::bi_size */
+ u32 bytes;
+
+ /*
+ * References to this structure. There is one reference per in-flight
+ * bio plus one while we're still setting up.
+ */
+ refcount_t refs;
+
+ /* Array of checksums */
+ u8 *csums;
+
+ /* This must be last */
+ struct bio bio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -67,7 +105,6 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -77,20 +114,66 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock);
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset);
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate);
+/*
+ * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ *
+ * ilock_flags can have the following bit set:
+ *
+ * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
+ * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
+ * return -EAGAIN
+ * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
+ */
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_SHARED) {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock_shared(inode);
+ } else {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock(inode);
+ }
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
+ return 0;
+}
+
+/*
+ * btrfs_inode_unlock - unock inode i_rwsem
+ *
+ * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
+ * to decide whether the lock acquired is shared or exclusive.
+ */
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ if (ilock_flags & BTRFS_ILOCK_SHARED)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+}
/*
* Cleanup all submitted ordered extents in specified range to handle errors
@@ -102,58 +185,92 @@ static void __endio_write_update_ordered(struct inode *inode,
* to be released, which we want to happen only when finishing the ordered
* extent (btrfs_finish_ordered_io()).
*/
-static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *locked_page,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = page_offset(locked_page);
- u64 page_end = page_start + PAGE_SIZE - 1;
-
+ u64 page_start, page_end;
struct page *page;
+ if (locked_page) {
+ page_start = page_offset(locked_page);
+ page_end = page_start + PAGE_SIZE - 1;
+ }
+
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ /*
+ * For locked page, we will call end_extent_writepage() on it
+ * in run_delalloc_range() for the error handling. That
+ * end_extent_writepage() function will call
+ * btrfs_mark_ordered_io_finished() to clear page Ordered and
+ * run the ordered extent accounting.
+ *
+ * Here we can't just clear the Ordered bit, or
+ * btrfs_mark_ordered_io_finished() would skip the accounting
+ * for the page range, and the ordered extent will never finish.
+ */
+ if (locked_page && index == (page_start >> PAGE_SHIFT)) {
+ index++;
+ continue;
+ }
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
- ClearPagePrivate2(page);
+
+ /*
+ * Here we just clear all Ordered bits for every page in the
+ * range, then btrfs_mark_ordered_io_finished() will handle
+ * the ordered extent accounting for the range.
+ */
+ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
+ offset, bytes);
put_page(page);
}
- /*
- * In case this page belongs to the delalloc range being instantiated
- * then skip it, since the first page of a range is going to be
- * properly cleaned up by the caller of run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- offset += PAGE_SIZE;
- bytes -= PAGE_SIZE;
+ if (locked_page) {
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_start + PAGE_SIZE)
+ return;
+ /*
+ * In case this page belongs to the delalloc range being
+ * instantiated then skip it, since the first page of a range is
+ * going to be properly cleaned up by the caller of
+ * run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
+ }
}
- return __endio_write_update_ordered(inode, offset, bytes, false);
+ return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+ struct btrfs_new_inode_args *args)
{
int err;
- err = btrfs_init_acl(trans, inode, dir);
- if (!err)
- err = btrfs_xattr_security_init(trans, inode, dir, qstr);
- return err;
+ if (args->default_acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ACL_TYPE_DEFAULT);
+ if (err)
+ return err;
+ }
+ if (args->acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (err)
+ return err;
+ }
+ if (!args->default_acl && !args->acl)
+ cache_no_acl(args->inode);
+ return btrfs_xattr_security_init(trans, args->inode, args->dir,
+ &args->dentry->d_name);
}
/*
@@ -162,12 +279,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, int extent_inserted,
- struct btrfs_root *root, struct inode *inode,
- u64 start, size_t size, size_t compressed_size,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode, bool extent_inserted,
+ size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
+ struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -175,7 +294,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *ei;
int ret;
size_t cur_size = size;
- unsigned long offset;
+ u64 i_size;
ASSERT((compressed_size > 0 && compressed_pages) ||
(compressed_size == 0 && !compressed_pages));
@@ -183,18 +302,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
if (compressed_size && compressed_pages)
cur_size = compressed_size;
- inode_add_bytes(inode, size);
-
if (!extent_inserted) {
struct btrfs_key key;
size_t datasize;
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = start;
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (ret)
@@ -218,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = kmap_local_page(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
i++;
ptr += cur_size;
@@ -229,29 +345,38 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->i_mapping,
- start >> PAGE_SHIFT);
+ page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page);
- offset = offset_in_page(start);
- write_extent_buffer(leaf, kaddr + offset, ptr, size);
- kunmap_atomic(kaddr);
+ kaddr = kmap_local_page(page);
+ write_extent_buffer(leaf, kaddr, ptr, size);
+ kunmap_local(kaddr);
put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
/*
- * we're an inline extent, so nobody can
- * extend the file past i_size without locking
- * a page we already have locked.
+ * We align size to sectorsize for inline extents just for simplicity
+ * sake.
+ */
+ ret = btrfs_inode_set_file_extent_range(inode, 0,
+ ALIGN(size, root->fs_info->sectorsize));
+ if (ret)
+ goto fail;
+
+ /*
+ * We're an inline extent, so nobody can extend the file past i_size
+ * without locking a page we already have locked.
*
- * We must do any isize and inode updates
- * before we unlock the pages. Otherwise we
- * could end up racing with unlink.
+ * We must do any i_size and inode updates before we unlock the pages.
+ * Otherwise we could end up racing with unlink.
*/
- BTRFS_I(inode)->disk_i_size = inode->i_size;
- ret = btrfs_update_inode(trans, root, inode);
+ i_size = i_size_read(&inode->vfs_inode);
+ if (update_i_size && size > i_size) {
+ i_size_write(&inode->vfs_inode, size);
+ i_size = size;
+ }
+ inode->disk_i_size = i_size;
fail:
return ret;
@@ -263,36 +388,31 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct inode *inode, u64 start,
- u64 end, size_t compressed_size,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(inode);
- u64 actual_end = min(end + 1, isize);
- u64 inline_len = actual_end - start;
- u64 aligned_end = ALIGN(end, fs_info->sectorsize);
- u64 data_len = inline_len;
+ u64 data_len = (compressed_size ?: size);
int ret;
struct btrfs_path *path;
- int extent_inserted = 0;
- u32 extent_item_size;
-
- if (compressed_size)
- data_len = compressed_size;
- if (start > 0 ||
- actual_end > fs_info->sectorsize ||
+ /*
+ * We can create an inline extent if it ends at or beyond the current
+ * i_size, is no larger than a sector (decompressed), and the (possibly
+ * compressed) data fits in a leaf and the configured maximum inline
+ * size.
+ */
+ if (size < i_size_read(&inode->vfs_inode) ||
+ size > fs_info->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- (!compressed_size &&
- (actual_end & (fs_info->sectorsize - 1)) == 0) ||
- end + 1 < isize ||
- data_len > fs_info->max_inline) {
+ data_len > fs_info->max_inline)
return 1;
- }
path = btrfs_alloc_path();
if (!path)
@@ -303,29 +423,23 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
btrfs_free_path(path);
return PTR_ERR(trans);
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
-
- if (compressed_size && compressed_pages)
- extent_item_size = btrfs_file_extent_calc_inline_size(
- compressed_size);
- else
- extent_item_size = btrfs_file_extent_calc_inline_size(
- inline_len);
+ trans->block_rsv = &inode->block_rsv;
- ret = __btrfs_drop_extents(trans, root, inode, path,
- start, aligned_end, NULL,
- 1, 1, extent_item_size, &extent_inserted);
+ drop_args.path = path;
+ drop_args.start = 0;
+ drop_args.end = fs_info->sectorsize;
+ drop_args.drop_cache = true;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- if (isize > actual_end)
- inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, extent_inserted,
- root, inode, start,
- inline_len, compressed_size,
- compress_type, compressed_pages);
+ ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
+ size, compressed_size, compress_type,
+ compressed_pages, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -334,8 +448,17 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
+ btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
+ btrfs_set_inode_full_sync(inode);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -364,15 +487,14 @@ struct async_chunk {
struct page *locked_page;
u64 start;
u64 end;
- unsigned int write_flags;
+ blk_opf_t write_flags;
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
- atomic_t *pending;
+ struct async_cow *async_cow;
};
struct async_cow {
- /* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
@@ -399,53 +521,75 @@ static noinline int add_async_extent(struct async_chunk *cow,
}
/*
- * Check if the inode has flags compatible with compression
- */
-static inline bool inode_can_compress(struct inode *inode)
-{
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
- BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
- return false;
- return true;
-}
-
-/*
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
-static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (!inode_can_compress(inode)) {
+ if (!btrfs_inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(BTRFS_I(inode)));
+ btrfs_ino(inode));
return 0;
}
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page then run each delalloc range in the page, thus
+ * for the following case, we will hit some subpage specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In above case, both range A and range B will try to unlock the full
+ * page [0, 64K), causing the one finished later will have page
+ * unlocked already, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit that subpage compression can only
+ * if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the tailing partial page will be locked until the full compression
+ * finishes, delaying the write of other range.
+ *
+ * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
+ * first to prevent any submitted async extent to unlock the full page.
+ * By this, we can ensure for subpage case that only the last async_cow
+ * will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(end + 1))
+ return 0;
+ }
+
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
/* defrag ioctl */
- if (BTRFS_I(inode)->defrag_compress)
+ if (inode->defrag_compress)
return 1;
/* bad compression ratios */
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
if (btrfs_test_opt(fs_info, COMPRESS) ||
- BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
- BTRFS_I(inode)->prop_compress)
- return btrfs_compress_heuristic(inode, start, end);
+ inode->flags & BTRFS_INODE_COMPRESS ||
+ inode->prop_compress)
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
return 0;
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
- u64 start, u64 end, u64 num_bytes, u64 small_write)
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode);
+ btrfs_add_inode_defrag(NULL, inode, small_write);
}
/*
@@ -504,7 +648,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
nr_pages = min_t(unsigned long, nr_pages,
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
@@ -524,13 +667,24 @@ again:
total_compressed = actual_end - start;
/*
- * skip compression for a small file range(<=blocksize) that
+ * Skip compression for a small file range(<=blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
+ /*
+ * For subpage case, we require full page alignment for the sector
+ * aligned range.
+ * Thus we must also check against @actual_end, not just @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(round_up(actual_end, blocksize)))
+ goto cleanup_and_bail_uncompressed;
+ }
+
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -541,7 +695,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(inode, start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -584,34 +738,35 @@ again:
if (!ret) {
unsigned long offset = offset_in_page(total_compressed);
struct page *page = pages[nr_pages - 1];
- char *kaddr;
/* zero the tail end of the last page, we might be
* sending it down to disk
*/
- if (offset) {
- kaddr = kmap_atomic(page);
- memset(kaddr + offset, 0,
- PAGE_SIZE - offset);
- kunmap_atomic(kaddr);
- }
+ if (offset)
+ memzero_page(page, offset, PAGE_SIZE - offset);
will_compress = 1;
}
}
cont:
- if (start == 0) {
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
+ 0, BTRFS_COMPRESS_NONE,
+ NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(inode, start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
total_compressed,
- compress_type, pages);
+ compress_type, pages,
+ false);
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
@@ -631,20 +786,26 @@ cont:
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
+ NULL,
clear_flags,
PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
}
- kfree(pages);
-
return 0;
}
}
@@ -662,7 +823,7 @@ cont:
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
- total_in = ALIGN(total_in, PAGE_SIZE);
+ total_in = round_up(total_in, fs_info->sectorsize);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
@@ -743,185 +904,207 @@ static void free_async_extent_pages(struct async_extent *async_extent)
async_extent->pages = NULL;
}
-/*
- * phase two of compressed writeback. This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued. We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
- struct inode *inode = async_chunk->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
-again:
- while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also we call cow_file_range() with @unlock_page == 0, so that we
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0, NULL);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+ if (locked_page) {
+ const u64 page_start = page_offset(locked_page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
-retry:
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
- /* did the compression code fall back to uncompressed IO? */
- if (!async_extent->pages) {
- int page_started = 0;
- unsigned long nr_written = 0;
+ btrfs_page_set_error(inode->root->fs_info, locked_page,
+ page_start, PAGE_SIZE);
+ set_page_writeback(locked_page);
+ end_page_writeback(locked_page);
+ end_extent_writepage(locked_page, ret, page_start, page_end);
+ unlock_page(locked_page);
+ }
+ goto out;
+ }
- /* allocate blocks */
- ret = cow_file_range(inode, async_chunk->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
- /* JDM XXX */
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
- /*
- * if page_started, cow_file_range inserted an
- * inline extent and took care of all the unlocking
- * and IO for us. Otherwise, we need to submit
- * all those pages down to the drive.
- */
- if (!page_started && !ret)
- extent_write_locked_range(inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- WB_SYNC_ALL);
- else if (ret && async_chunk->locked_page)
- unlock_page(async_chunk->locked_page);
- kfree(async_extent);
- cond_resched();
- continue;
- }
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
- ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
- 0, alloc_hint, &ins, 1, 1);
- if (ret) {
- free_async_extent_pages(async_extent);
-
- if (ret == -ENOSPC) {
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
-
- /*
- * we need to redirty the pages if we decide to
- * fallback to uncompressed IO, otherwise we
- * will not submit these pages down to lower
- * layers.
- */
- extent_range_redirty_for_io(inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
-
- goto retry;
- }
- goto out_free;
- }
- /*
- * here we're doing allocation and writeback of the
- * compressed pages
- */
- em = create_io_em(inode, async_extent->start,
- async_extent->ram_size, /* len */
- async_extent->start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
- if (IS_ERR(em))
- /* ret value is not necessary due to void function */
- goto out_free_reserve;
- free_extent_map(em);
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end, NULL);
- ret = btrfs_add_ordered_extent_compress(inode,
- async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- BTRFS_ORDERED_COMPRESSED,
- async_extent->compress_type);
- if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
- goto out_free_reserve;
- }
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ /* We have fall back to uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
/*
- * clear dirty, set writeback and unlock the pages.
+ * Here we used to try again by going back to non-compressed
+ * path for ENOSPC. But we can't reserve space even for
+ * compressed size, how could it work for uncompressed size
+ * which requires larger size? So here we directly go error
+ * path.
*/
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK);
- if (btrfs_submit_compressed_write(inode,
- async_extent->start,
- async_extent->ram_size,
- ins.objectid,
- ins.offset, async_extent->pages,
- async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css)) {
- struct page *p = async_extent->pages[0];
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- p->mapping = inode->i_mapping;
- btrfs_writepage_endio_finish_ordered(p, start, end, 0);
-
- p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end,
- NULL, 0,
- PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
- alloc_hint = ins.objectid + ins.offset;
- kfree(async_extent);
- cond_resched();
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
}
- return;
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ async_extent->ram_size, /* ram_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* disk_num_bytes */
+ 0, /* offset */
+ 1 << BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ goto out_free_reserve;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css, true)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
- goto again;
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
}
-static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 num_bytes)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 alloc_hint = 0;
@@ -962,18 +1145,43 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
* *page_started is set to one if we unlock locked_page and do everything
* required to start IO on it. It may be clean and already done with
* IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follow:
+ *
+ * - page_started == 1 (return value)
+ * - All the pages are unlocked. IO is started.
+ * - Note that this can happen only on success
+ * - unlock == 1
+ * - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ * - On success, all the pages are locked for writing out them
+ * - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * example.
*/
-static noinline int cow_file_range(struct inode *inode,
+static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock)
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
+ u64 orig_start = start;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -982,8 +1190,7 @@ static noinline int cow_file_range(struct inode *inode,
bool extent_reserved = false;
int ret = 0;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
- WARN_ON_ONCE(1);
+ if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -992,12 +1199,25 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = max(blocksize, num_bytes);
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
- inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
+
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+ end + 1);
- if (start == 0) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1005,15 +1225,28 @@ static noinline int cow_file_range(struct inode *inode,
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(inode, start, end,
+ locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK);
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
+ /*
+ * locked_page is locked by the caller of
+ * writepage_delalloc(), not locked by
+ * __process_pages_contig().
+ *
+ * We can't let __process_pages_contig() to unlock it,
+ * as it doesn't have any subpage::writers recorded.
+ *
+ * Here we manually unlock the page, since the caller
+ * can't use page_started to determine if it's an
+ * inline extent or a compressed extent.
+ */
+ unlock_page(locked_page);
goto out;
} else if (ret < 0) {
goto out_unlock;
@@ -1021,13 +1254,27 @@ static noinline int cow_file_range(struct inode *inode,
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + num_bytes - 1, 0);
+
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (btrfs_is_data_reloc_root(root))
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1049,13 +1296,14 @@ static noinline int cow_file_range(struct inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size, 0);
+ ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
+ ins.objectid, cur_alloc_size, 0,
+ 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto out_drop_extent_cache;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
@@ -1070,24 +1318,25 @@ static noinline int cow_file_range(struct inode *inode,
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + ram_size - 1,
+ false);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /* we're not doing compressed IO, don't unlock the first
- * page (which the caller expects to stay locked), don't
- * clear any dirty bits and don't set any writeback bits
+ /*
+ * We're not doing compressed IO, don't unlock the first page
+ * (which the caller expects to stay locked), don't clear any
+ * dirty bits and don't set any writeback bits
*
- * Do set the Private2 bit so we know this page was properly
- * setup for writepage
+ * Do set the Ordered (Private2) bit so we know this page was
+ * properly setup for writepage.
*/
page_ops = unlock ? PAGE_UNLOCK : 0;
- page_ops |= PAGE_SET_PRIVATE2;
+ page_ops |= PAGE_SET_ORDERED;
- extent_clear_unlock_delalloc(inode, start,
- start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1111,39 +1360,89 @@ out:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ /*
+ * If done_offset is non-NULL and ret == -EAGAIN, we expect the
+ * caller to write out the successfully allocated region and retry.
+ */
+ if (done_offset && ret == -EAGAIN) {
+ if (orig_start < start)
+ *done_offset = start - 1;
+ else
+ *done_offset = start;
+ return ret;
+ } else if (ret == -EAGAIN) {
+ /* Convert to -ENOSPC since the caller cannot retry. */
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Now, we have three regions to clean up:
+ *
+ * |-------(1)----|---(2)---|-------------(3)----------|
+ * `- orig_start `- start `- start + cur_alloc_size `- end
+ *
+ * We process each region below.
+ */
+
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK;
- /*
- * If we reserved an extent for our delalloc range (or a subrange) and
- * failed to create the respective ordered extent, then it means that
- * when we reserved the extent we decremented the extent's size from
- * the data space_info's bytes_may_use counter and incremented the
- * space_info's bytes_reserved counter by the same amount. We must make
- * sure extent_clear_unlock_delalloc() does not try to decrement again
- * the data space_info's bytes_may_use counter, therefore we do not pass
- * it the flag EXTENT_CLEAR_DATA_RESV.
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ /*
+ * For the range (1). We have already instantiated the ordered extents
+ * for this region. They are cleaned up by
+ * btrfs_cleanup_ordered_extents() in e.g,
+ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+ * function.
+ *
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+ if (!unlock && orig_start < start) {
+ if (!locked_page)
+ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
+ }
+
+ /*
+ * For the range (2). If we reserved an extent for our delalloc range
+ * (or a subrange) and failed to create the respective ordered extent,
+ * then it means that when we reserved the extent we decremented the
+ * extent's size from the data space_info's bytes_may_use counter and
+ * incremented the space_info's bytes_reserved counter by the same
+ * amount. We must make sure extent_clear_unlock_delalloc() does not try
+ * to decrement again the data space_info's bytes_may_use counter,
+ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size,
+ start + cur_alloc_size - 1,
locked_page,
clear_bits,
page_ops);
start += cur_alloc_size;
if (start >= end)
- goto out;
+ return ret;
}
+
+ /*
+ * For the range (3). We never touched the region. In addition to the
+ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+ * space_info's bytes_may_use counter, reserved in
+ * btrfs_check_data_free_space().
+ */
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
- goto out;
+ return ret;
}
/*
@@ -1176,11 +1475,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1189,32 +1483,36 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
- /*
- * Since the pointer to 'pending' is at the beginning of the array of
- * async_chunk's, freeing it ensures the whole array has been freed.
- */
- if (atomic_dec_and_test(async_chunk->pending))
- kvfree(async_chunk->pending);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
}
-static int cow_file_range_async(struct inode *inode,
+static int cow_file_range_async(struct btrfs_inode *inode,
struct writeback_control *wbc,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
@@ -1224,11 +1522,11 @@ static int cow_file_range_async(struct inode *inode,
int i;
bool should_compress;
unsigned nofs_flag;
- const unsigned int write_flags = wbc_to_write_flags(wbc);
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
num_chunks = 1;
should_compress = false;
@@ -1244,9 +1542,8 @@ static int cow_file_range_async(struct inode *inode,
unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
- unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR;
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits, page_ops);
@@ -1266,9 +1563,9 @@ static int cow_file_range_async(struct inode *inode,
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
- ihold(inode);
- async_chunk[i].pending = &ctx->num_chunks;
- async_chunk[i].inode = inode;
+ ihold(&inode->vfs_inode);
+ async_chunk[i].async_cow = ctx;
+ async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
@@ -1323,15 +1620,62 @@ static int cow_file_range_async(struct inode *inode,
return 0;
}
-static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes)
+static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ struct page *locked_page, u64 start,
+ u64 end, int *page_started,
+ unsigned long *nr_written)
{
+ u64 done_offset = end;
int ret;
+ bool locked_page_done = false;
+
+ while (start <= end) {
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0, &done_offset);
+ if (ret && ret != -EAGAIN)
+ return ret;
+
+ if (*page_started) {
+ ASSERT(ret == 0);
+ return 0;
+ }
+
+ if (ret == 0)
+ done_offset = end;
+
+ if (done_offset == start) {
+ wait_on_bit_io(&inode->root->fs_info->flags,
+ BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
+
+ if (!locked_page_done) {
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ }
+ locked_page_done = true;
+ extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+
+ start = done_offset + 1;
+ }
+
+ *page_started = 1;
+
+ return 0;
+}
+
+static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 num_bytes, bool nowait)
+{
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
struct btrfs_ordered_sum *sums;
+ int ret;
LIST_HEAD(list);
- ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
- bytenr + num_bytes - 1, &list, 0);
+ ret = btrfs_lookup_csums_range(csum_root, bytenr,
+ bytenr + num_bytes - 1, &list, 0,
+ nowait);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1345,6 +1689,209 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
return 1;
}
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
+ const u64 start, const u64 end,
+ int *page_started, unsigned long *nr_written)
+{
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
+ const u64 range_bytes = end + 1 - start;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 range_start = start;
+ u64 count;
+
+ /*
+ * If EXTENT_NORESERVE is set it means that when the buffered write was
+ * made we had not enough available data space and therefore we did not
+ * reserve data space for it, since we though we could do NOCOW for the
+ * respective file range (either there is prealloc extent or the inode
+ * has the NOCOW bit set).
+ *
+ * However when we need to fallback to COW mode (because for example the
+ * block group for the corresponding extent was turned to RO mode by a
+ * scrub or relocation) we need to do the following:
+ *
+ * 1) We increment the bytes_may_use counter of the data space info.
+ * If COW succeeds, it allocates a new data extent and after doing
+ * that it decrements the space info's bytes_may_use counter and
+ * increments its bytes_reserved counter by the same amount (we do
+ * this at btrfs_add_reserved_bytes()). So we need to increment the
+ * bytes_may_use counter to compensate (when space is reserved at
+ * buffered write time, the bytes_may_use counter is incremented);
+ *
+ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
+ * that if the COW path fails for any reason, it decrements (through
+ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
+ * data space info, which we incremented in the step above.
+ *
+ * If we need to fallback to cow and the inode corresponds to a free
+ * space cache inode or an inode of the data relocation tree, we must
+ * also increment bytes_may_use of the data space_info for the same
+ * reason. Space caches and relocated data extents always get a prealloc
+ * extent for them, however scrub or balance may have set the block
+ * group that contains that extent to RO mode and therefore force COW
+ * when starting writeback.
+ */
+ count = count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0);
+ if (count > 0 || is_space_ino || is_reloc_ino) {
+ u64 bytes = count;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+
+ if (is_space_ino || is_reloc_ino)
+ bytes = range_bytes;
+
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ spin_unlock(&sinfo->lock);
+
+ if (count > 0)
+ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ NULL);
+ }
+
+ return cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 1, NULL);
+}
+
+struct can_nocow_file_extent_args {
+ /* Input fields. */
+
+ /* Start file offset of the range we want to NOCOW. */
+ u64 start;
+ /* End file offset (inclusive) of the range we want to NOCOW. */
+ u64 end;
+ bool writeback_path;
+ bool strict;
+ /*
+ * Free the path passed to can_nocow_file_extent() once it's not needed
+ * anymore.
+ */
+ bool free_path;
+
+ /* Output fields. Only set when can_nocow_file_extent() returns 1. */
+
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ /* Number of bytes that can be written to in NOCOW mode. */
+ u64 num_bytes;
+};
+
+/*
+ * Check if we can NOCOW the file extent that the path points to.
+ * This function may return with the path released, so the caller should check
+ * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
+ *
+ * Returns: < 0 on error
+ * 0 if we can not NOCOW
+ * 1 if we can NOCOW
+ */
+static int can_nocow_file_extent(struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ struct can_nocow_file_extent_args *args)
+{
+ const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 extent_type;
+ int can_nocow = 0;
+ int ret = 0;
+ bool nowait = path->nowait;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ goto out;
+
+ /* Can't access these fields unless we know it's not an inline extent. */
+ args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ extent_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
+ /*
+ * If the extent was created before the generation where the last snapshot
+ * for its subvolume was created, then this implies the extent is shared,
+ * hence we must COW.
+ */
+ if (!args->strict &&
+ btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ /* An explicit hole, must COW. */
+ if (args->disk_bytenr == 0)
+ goto out;
+
+ /* Compressed/encrypted/encoded extents must be COWed. */
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The following checks can be expensive, as they need to take other
+ * locks and do btree or rbtree searches, so release the path to avoid
+ * blocking other tasks for too long.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
+ key->offset - args->extent_offset,
+ args->disk_bytenr, false, path);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ if (args->free_path) {
+ /*
+ * We don't need the path anymore, plus through the
+ * csum_exist_in_range() call below we will end up allocating
+ * another path. So free the path to avoid unnecessary extra
+ * memory usage.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+ }
+
+ /* If there are pending snapshots for this root, we must COW. */
+ if (args->writeback_path && !is_freespace_inode &&
+ atomic_read(&root->snapshot_force_cow))
+ goto out;
+
+ args->disk_bytenr += args->extent_offset;
+ args->disk_bytenr += args->start - key->offset;
+ args->num_bytes = min(args->end + 1, extent_end) - args->start;
+
+ /*
+ * Force COW if csums exist in the range. This ensures that csums for a
+ * given extent are either valid or do not exist.
+ */
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
+ nowait);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ can_nocow = 1;
+ out:
+ if (args->free_path && path)
+ btrfs_free_path(path);
+
+ return ret < 0 ? ret : can_nocow;
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -1352,23 +1899,23 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static noinline int run_delalloc_nocow(struct inode *inode,
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
- int *page_started, int force,
+ int *page_started,
unsigned long *nr_written)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ u64 ino = btrfs_ino(inode);
+ struct btrfs_block_group *bg;
bool nocow = false;
- u64 disk_bytenr = 0;
+ struct can_nocow_file_extent_args nocow_args = { 0 };
path = btrfs_alloc_path();
if (!path) {
@@ -1376,21 +1923,21 @@ static noinline int run_delalloc_nocow(struct inode *inode,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
return -ENOMEM;
}
+ nocow_args.end = end;
+ nocow_args.writeback_path = true;
+
while (1) {
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
u64 extent_end;
- u64 extent_offset;
- u64 num_bytes = 0;
- u64 disk_num_bytes;
u64 ram_bytes;
+ u64 nocow_end;
int extent_type;
nocow = false;
@@ -1466,109 +2013,38 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
-
+ /* If this is triggered then we have a memory corruption. */
+ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
+ if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
+ ret = -EUCLEAN;
+ goto error;
+ }
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- if (extent_type == BTRFS_FILE_EXTENT_REG ||
- extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- extent_offset = btrfs_file_extent_offset(leaf, fi);
- extent_end = found_key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- disk_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf, fi);
- /*
- * If the extent we got ends before our current offset,
- * skip to the next extent.
- */
- if (extent_end <= cur_offset) {
- path->slots[0]++;
- goto next_slot;
- }
- /* Skip holes */
- if (disk_bytenr == 0)
- goto out_check;
- /* Skip compressed/encrypted/encoded extents */
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out_check;
- /*
- * If extent is created before the last volume's snapshot
- * this implies the extent is shared, hence we can't do
- * nocow. This is the same check as in
- * btrfs_cross_ref_exist but without calling
- * btrfs_search_slot.
- */
- if (!freespace_inode &&
- btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
- goto out_check;
- if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
- goto out_check;
- /* If extent is RO, we must COW it */
- if (btrfs_extent_readonly(fs_info, disk_bytenr))
- goto out_check;
- ret = btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
+ extent_end = btrfs_file_extent_end(path);
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- disk_bytenr += extent_offset;
- disk_bytenr += cur_offset - found_key.offset;
- num_bytes = min(end + 1, extent_end) - cur_offset;
- /*
- * If there are pending snapshots for this root, we
- * fall into common COW way
- */
- if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
- goto out_check;
- /*
- * force cow if csum exists in the range.
- * this ensure that csum for a given extent are
- * either valid or do not exist.
- */
- ret = csum_exist_in_range(fs_info, disk_bytenr,
- num_bytes);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
- goto out_check;
- nocow = true;
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- extent_end = found_key.offset + ram_bytes;
- extent_end = ALIGN(extent_end, fs_info->sectorsize);
- /* Skip extents outside of our requested range */
- if (extent_end <= start) {
- path->slots[0]++;
- goto next_slot;
- }
- } else {
- /* If this triggers then we have a memory corruption */
- BUG();
+ /*
+ * If the extent we got ends before our current offset, skip to
+ * the next extent.
+ */
+ if (extent_end <= cur_offset) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+
+ nocow_args.start = cur_offset;
+ ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ } else if (ret == 0) {
+ goto out_check;
}
+
+ ret = 0;
+ bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ if (bg)
+ nocow = true;
out_check:
/*
* If nocow is false then record the beginning of the range
@@ -1580,89 +2056,88 @@ out_check:
cur_offset = extent_end;
if (cur_offset > end)
break;
+ if (!path->nodes[0])
+ continue;
path->slots[0]++;
goto next_slot;
}
- btrfs_release_path(path);
-
/*
* COW range from cow_start to found_key.offset - 1. As the key
* will contain the beginning of the first extent that can be
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page,
- cow_start, found_key.offset - 1,
- page_started, nr_written, 1);
- if (ret) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
+ ret = fallback_to_cow(inode, locked_page,
+ cow_start, found_key.offset - 1,
+ page_started, nr_written);
+ if (ret)
goto error;
- }
cow_start = (u64)-1;
}
+ nocow_end = cur_offset + nocow_args.num_bytes - 1;
+
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 orig_start = found_key.offset - extent_offset;
+ u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
- em = create_io_em(inode, cur_offset, num_bytes,
+ em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
orig_start,
- disk_bytenr, /* block_start */
- num_bytes, /* block_len */
- disk_num_bytes, /* orig_block_len */
+ nocow_args.disk_bytenr, /* block_start */
+ nocow_args.num_bytes, /* block_len */
+ nocow_args.disk_num_bytes, /* orig_block_len */
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
ret = PTR_ERR(em);
goto error;
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, cur_offset,
- disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_PREALLOC);
+ ret = btrfs_add_ordered_extent(inode,
+ cur_offset, nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes, 0,
+ 1 << BTRFS_ORDERED_PREALLOC,
+ BTRFS_COMPRESS_NONE);
if (ret) {
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
- cur_offset + num_bytes - 1,
- 0);
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ nocow_end, false);
goto error;
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
- disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_NOCOW);
+ nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes,
+ 0,
+ 1 << BTRFS_ORDERED_NOCOW,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto error;
}
- if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- nocow = false;
+ if (nocow) {
+ btrfs_dec_nocow_writers(bg);
+ nocow = false;
+ }
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
* from freeing metadata of created ordered extent.
*/
ret = btrfs_reloc_clone_csums(inode, cur_offset,
- num_bytes);
+ nocow_args.num_bytes);
- extent_clear_unlock_delalloc(inode, cur_offset,
- cur_offset + num_bytes - 1,
+ extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
cur_offset = extent_end;
@@ -1683,75 +2158,82 @@ out_check:
if (cow_start != (u64)-1) {
cur_offset = end;
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = fallback_to_cow(inode, locked_page, cow_start, end,
+ page_started, nr_written);
if (ret)
goto error;
}
error:
if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+ btrfs_dec_nocow_writers(bg);
if (ret && cur_offset < end)
extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
btrfs_free_path(path);
return ret;
}
-static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
-
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
- return 0;
-
- /*
- * @defrag_bytes is a hint value, no spinlock held here,
- * if is not zero, it means the file is defragging.
- * Force cow if given extent needs to be defragged.
- */
- if (BTRFS_I(inode)->defrag_bytes &&
- test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_DEFRAG, 0, NULL))
- return 1;
-
- return 0;
+ if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
+ 0, NULL))
+ return false;
+ return true;
+ }
+ return false;
}
/*
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
int ret;
- int force_cow = need_force_cow(inode, start, end);
+ const bool zoned = btrfs_is_zoned(inode->root->fs_info);
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
- ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, 1, nr_written);
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
+ if (should_nocow(inode, start, end)) {
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * precaution, that we're only writing sequentially. It's safe
+ * to use run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, 0, nr_written);
- } else if (!inode_can_compress(inode) ||
+ page_started, nr_written);
+ } else if (!btrfs_inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ if (zoned)
+ ret = run_delalloc_zoned(inode, locked_page, start, end,
+ page_started, nr_written);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1, NULL);
} else {
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
@@ -1761,6 +2243,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 size;
/* not delalloc, ignore it */
@@ -1768,7 +2251,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
return;
size = orig->end - orig->start + 1;
- if (size > BTRFS_MAX_EXTENT_SIZE) {
+ if (size > fs_info->max_extent_size) {
u32 num_extents;
u64 new_size;
@@ -1777,10 +2260,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
- num_extents = count_max_extents(new_size);
+ num_extents = count_max_extents(fs_info, new_size);
new_size = split - orig->start;
- num_extents += count_max_extents(new_size);
- if (count_max_extents(size) >= num_extents)
+ num_extents += count_max_extents(fs_info, new_size);
+ if (count_max_extents(fs_info, size) >= num_extents)
return;
}
@@ -1797,6 +2280,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size, old_size;
u32 num_extents;
@@ -1810,7 +2294,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
new_size = other->end - new->start + 1;
/* we're not bigger than the max, unreserve the space and go */
- if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ if (new_size <= fs_info->max_extent_size) {
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1836,10 +2320,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
* this case.
*/
old_size = other->end - other->start + 1;
- num_extents = count_max_extents(old_size);
+ num_extents = count_max_extents(fs_info, old_size);
old_size = new->end - new->start + 1;
- num_extents += count_max_extents(old_size);
- if (count_max_extents(new_size) >= num_extents)
+ num_extents += count_max_extents(fs_info, old_size);
+ if (count_max_extents(fs_info, new_size) >= num_extents)
return;
spin_lock(&BTRFS_I(inode)->lock);
@@ -1904,21 +2388,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* list of inodes that have pending delalloc work to be done.
*/
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits)
+ u32 bits)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
spin_lock(&BTRFS_I(inode)->lock);
@@ -1933,7 +2417,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- if (*bits & EXTENT_DEFRAG)
+ if (bits & EXTENT_DEFRAG)
BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
@@ -1942,7 +2426,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
state->start;
@@ -1955,14 +2439,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* accounting happens.
*/
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
- struct extent_state *state, unsigned *bits)
+ struct extent_state *state, u32 bits)
{
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
- if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+ if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
spin_unlock(&inode->lock);
@@ -1973,7 +2457,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
@@ -1986,7 +2470,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* don't need to call delalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_CLEAR_META_RESV &&
+ if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len, false);
@@ -1994,12 +2478,10 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
- (*bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(
- &inode->vfs_inode,
- state->start, len);
+ (bits & EXTENT_CLEAR_DATA_RESV))
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2013,55 +2495,17 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
+ if (bits & EXTENT_ADD_INODE_BYTES)
+ inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
}
/*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
- int ret;
- struct btrfs_io_geometry geom;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
- return 0;
-
- length = bio->bi_iter.bi_size;
- map_length = length;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
- &geom);
- if (ret < 0)
- return ret;
-
- if (geom.len < length + size)
- return 1;
- return 0;
-}
-
-/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
@@ -2069,107 +2513,275 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
- struct inode *inode = private_data;
- blk_status_t ret = 0;
-
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
}
/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read: sync submit
- *
- * b) write without checksum: sync submit
+ * Split an extent_map at [start, start + len]
*
- * c) write with checksum:
- * c-1) if bio is issued by fsync: sync submit
- * (sync_writers != 0)
- *
- * c-2) if root is reloc root: sync submit
- * (only in case of buffered IO)
- *
- * c-3) otherwise: async submit
+ * This function is intended to be used only for extract_ordered_extent().
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+ u64 pre, u64 post)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ struct extent_map *split_pre = NULL;
+ struct extent_map *split_mid = NULL;
+ struct extent_map *split_post = NULL;
+ int ret = 0;
+ unsigned long flags;
+
+ /* Sanity check */
+ if (pre == 0 && post == 0)
+ return 0;
+
+ split_pre = alloc_extent_map();
+ if (pre)
+ split_mid = alloc_extent_map();
+ if (post)
+ split_post = alloc_extent_map();
+ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ASSERT(pre + post < len);
+
+ lock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ASSERT(em->len == len);
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* First, replace the em with a new extent_map starting from * em->start */
+ split_pre->start = em->start;
+ split_pre->len = (pre ? pre : em->len - post);
+ split_pre->orig_start = split_pre->start;
+ split_pre->block_start = em->block_start;
+ split_pre->block_len = split_pre->len;
+ split_pre->orig_block_len = split_pre->block_len;
+ split_pre->ram_bytes = split_pre->len;
+ split_pre->flags = flags;
+ split_pre->compress_type = em->compress_type;
+ split_pre->generation = em->generation;
+
+ replace_extent_mapping(em_tree, em, split_pre, 1);
+
+ /*
+ * Now we only have an extent_map at:
+ * [em->start, em->start + pre] if pre != 0
+ * [em->start, em->start + em->len - post] if pre == 0
+ */
+
+ if (pre) {
+ /* Insert the middle extent_map */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre - post;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);
+ }
+
+ if (post) {
+ split_post->start = em->start + em->len - post;
+ split_post->len = post;
+ split_post->orig_start = split_post->start;
+ split_post->block_start = em->block_start + em->len - post;
+ split_post->block_len = split_post->len;
+ split_post->orig_block_len = split_post->block_len;
+ split_post->ram_bytes = split_post->len;
+ split_post->flags = flags;
+ split_post->compress_type = em->compress_type;
+ split_post->generation = em->generation;
+ add_extent_mapping(em_tree, split_post, 1);
+ }
+
+ /* Once for us */
+ free_extent_map(em);
+ /* Once for the tree */
+ free_extent_map(em);
+
+out_unlock:
+ write_unlock(&em_tree->lock);
+ unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
+out:
+ free_extent_map(split_pre);
+ free_extent_map(split_mid);
+ free_extent_map(split_post);
+
+ return ret;
+}
+static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ struct bio *bio, loff_t file_offset)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- blk_status_t ret = 0;
- int skip_sum;
- int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
+ struct btrfs_ordered_extent *ordered;
+ u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 file_len;
+ u64 len = bio->bi_iter.bi_size;
+ u64 end = start + len;
+ u64 ordered_end;
+ u64 pre, post;
+ int ret = 0;
- skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ if (WARN_ON_ONCE(!ordered))
+ return BLK_STS_IOERR;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+ /* No need to split */
+ if (ordered->disk_num_bytes == len)
+ goto out;
- if (bio_op(bio) != REQ_OP_WRITE) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
- if (ret)
- goto out;
+ /* We cannot split once end_bio'd ordered extent */
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (bio_flags & EXTENT_BIO_COMPRESSED) {
- ret = btrfs_submit_compressed_read(inode, bio,
- mirror_num,
- bio_flags);
- goto out;
- } else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
- if (ret)
- goto out;
- }
- goto mapit;
- } else if (async && !skip_sum) {
- /* csum items have already been cloned */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
- goto mapit;
- /* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
- 0, inode, btrfs_submit_bio_start);
+ /* We cannot split a compressed ordered extent */
+ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
+ /* bio must be in one ordered extent */
+ if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Checksum list should be empty */
+ if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
+ ret = -EINVAL;
goto out;
- } else if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
- if (ret)
- goto out;
}
-mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ file_len = ordered->num_bytes;
+ pre = start - ordered->disk_bytenr;
+ post = ordered_end - end;
+
+ ret = btrfs_split_ordered_extent(ordered, pre, post);
+ if (ret)
+ goto out;
+ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
out:
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
+}
+
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
+ }
+
+ /*
+ * If we need to checksum, and the I/O is not issued by fsync and
+ * friends, that is ->sync_writers != 0, defer the submission to a
+ * workqueue to parallelize it.
+ *
+ * Csum items for reloc roots have already been cloned at this point,
+ * so they are handled as part of the no-checksum case.
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(bi->root)) {
+ if (!atomic_read(&bi->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ btrfs_submit_bio_start))
+ return;
+
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
+ }
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+}
+
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
+
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
+ }
+
+ /* Save the original iter for read repair */
+ btrfs_bio(bio)->iter = bio->bi_iter;
+
+ /*
+ * Lookup bio sums does extra checks around whether we need to csum or
+ * not, which is why we ignore skip_sum here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
}
- return ret;
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
+ struct btrfs_root *csum_root = NULL;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ if (!csum_root)
+ csum_root = btrfs_csum_root(trans->fs_info,
+ sum->bytenr);
+ ret = btrfs_csum_file_blocks(trans, csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2177,13 +2789,71 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, cached_state,
+ GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end));
- return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
- extra_bits, cached_state);
+
+ if (start >= i_size_read(&inode->vfs_inode) &&
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
+ /*
+ * There can't be any extents following eof in this case so just
+ * set the delalloc new bit for the range directly.
+ */
+ extra_bits |= EXTENT_DELALLOC_NEW;
+ } else {
+ int ret;
+
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
+ end + 1 - start,
+ cached_state);
+ if (ret)
+ return ret;
+ }
+
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
+ cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2200,7 +2870,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct page *page;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 page_start;
u64 page_end;
int ret = 0;
@@ -2208,7 +2878,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
- inode = fixup->inode;
+ inode = BTRFS_I(fixup->inode);
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
@@ -2245,8 +2915,7 @@ again:
* when the page was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
true);
@@ -2262,20 +2931,18 @@ again:
if (ret)
goto out_page;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state);
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PagePrivate2(page))
+ if (PageOrdered(page))
goto out_reserved;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
- PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
- page_end, &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2295,12 +2962,11 @@ again:
BUG_ON(!PageDirty(page));
free_delalloc_space = false;
out_reserved:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -2312,7 +2978,7 @@ out_page:
clear_page_dirty_for_io(page);
SetPageError(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2322,7 +2988,7 @@ out_page:
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
- btrfs_add_delayed_iput(inode);
+ btrfs_add_delayed_iput(&inode->vfs_inode);
}
/*
@@ -2336,14 +3002,14 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
- /* this page is properly in the ordered list */
- if (TestClearPagePrivate2(page))
+ /* This page has ordered extent covering it already */
+ if (PageOrdered(page))
return 0;
/*
@@ -2367,7 +3033,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
* page->mapping outside of the page lock.
*/
ihold(inode);
- SetPageChecked(page);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -2378,19 +3044,22 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode, u64 file_pos,
- u64 disk_bytenr, u64 disk_num_bytes,
- u64 num_bytes, u64 ram_bytes,
- u8 compression, u8 encryption,
- u16 other_encoding, int extent_type)
+ struct btrfs_inode *inode, u64 file_pos,
+ struct btrfs_file_extent_item *stack_fi,
+ const bool update_inode_bytes,
+ u64 qgroup_reserved)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_root *root = inode->root;
+ const u64 sectorsize = root->fs_info->sectorsize;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
- u64 qg_released;
- int extent_inserted = 0;
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 offset = btrfs_stack_file_extent_offset(stack_fi);
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
path = btrfs_alloc_path();
@@ -2406,57 +3075,64 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
- file_pos + num_bytes, NULL, 0,
- 1, sizeof(*fi), &extent_inserted);
+ drop_args.path = path;
+ drop_args.start = file_pos;
+ drop_args.end = file_pos + num_bytes;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*stack_fi);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
- if (!extent_inserted) {
- ins.objectid = btrfs_ino(BTRFS_I(inode));
+ if (!drop_args.extent_inserted) {
+ ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
- sizeof(*fi));
+ sizeof(*stack_fi));
if (ret)
goto out;
}
leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_set_file_extent_type(leaf, fi, extent_type);
- btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, fi, 0);
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
- btrfs_set_file_extent_compression(leaf, fi, compression);
- btrfs_set_file_extent_encryption(leaf, fi, encryption);
- btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
+ write_extent_buffer(leaf, stack_fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_file_extent_item));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_add_bytes(inode, num_bytes);
+ /*
+ * If we dropped an inline extent here, we know the range where it is
+ * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
+ * number of bytes only for that range containing the inline extent.
+ * The remaining of the range will be processed when clearning the
+ * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
+ */
+ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
+ u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
+
+ inline_size = drop_args.bytes_found - inline_size;
+ btrfs_update_inode_bytes(inode, sectorsize, inline_size);
+ drop_args.bytes_found -= inline_size;
+ num_bytes -= sectorsize;
+ }
+
+ if (update_inode_bytes)
+ btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
- /*
- * Release the reserved range from inode dirty range map, as it is
- * already moved into delayed_ref_head
- */
- ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
- if (ret < 0)
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
+ if (ret)
goto out;
- qg_released = ret;
- ret = btrfs_alloc_reserved_file_extent(trans, root,
- btrfs_ino(BTRFS_I(inode)),
- file_pos, qg_released, &ins);
+
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
+ file_pos - offset,
+ qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2478,17 +3154,56 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(cache);
}
-/* as ordered data IO finishes, this gets called so we can finish
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *oe)
+{
+ struct btrfs_file_extent_item stack_fi;
+ bool update_inode_bytes;
+ u64 num_bytes = oe->num_bytes;
+ u64 ram_bytes = oe->ram_bytes;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
+ oe->disk_num_bytes);
+ btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ num_bytes = oe->truncated_len;
+ ram_bytes = num_bytes;
+ }
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ /*
+ * For delalloc, when completing an ordered extent we update the inode's
+ * bytes when clearing the range in the inode's io tree, so pass false
+ * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
+ * except if the ordered extent was truncated.
+ */
+ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
+
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ update_inode_bytes, oe->qgroup_rsv);
+}
+
+/*
+ * As ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
- struct inode *inode = ordered_extent->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
u64 start, end;
int compress_type = 0;
@@ -2496,27 +3211,35 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
u64 logical_len = ordered_extent->num_bytes;
bool freespace_inode;
bool truncated = false;
- bool range_locked = false;
- bool clear_new_delalloc_bytes = false;
bool clear_reserved_extent = true;
- unsigned int clear_bits;
+ unsigned int clear_bits = EXTENT_DEFRAG;
start = ordered_extent->file_offset;
end = start + ordered_extent->num_bytes - 1;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
- clear_new_delalloc_bytes = true;
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
+ clear_bits |= EXTENT_DELALLOC_NEW;
- freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(inode);
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
goto out;
}
- btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
+ btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
+
+ btrfs_free_io_failure_record(inode, start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -2529,14 +3252,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
- /*
- * For mwrite(mmap + memset to write) case, we still reserve
- * space for NOCOW range.
- * As NOCOW won't cause a new delayed ref, just free the space
- */
- btrfs_qgroup_free_data(inode, NULL, start,
- ordered_extent->num_bytes);
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -2546,15 +3262,15 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out;
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
- range_locked = true;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ clear_bits |= EXTENT_LOCKED;
+ lock_extent(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -2566,26 +3282,21 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- btrfs_qgroup_free_data(inode, NULL, start,
- ordered_extent->num_bytes);
- ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
+ ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_reserved_file_extent(trans, inode, start,
- ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes,
- logical_len, logical_len,
- compress_type, 0, 0,
- BTRFS_FILE_EXTENT_REG);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2593,21 +3304,31 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset,
+ unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ /*
+ * If this is a new delalloc range, clear its new delalloc flag to
+ * update the inode's number of bytes. This needs to be done first
+ * before updating the inode item.
+ */
+ if ((clear_bits & EXTENT_DELALLOC_NEW) &&
+ !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
+ clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ &cached_state);
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
@@ -2615,13 +3336,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
ret = 0;
out:
- clear_bits = EXTENT_DEFRAG;
- if (range_locked)
- clear_bits |= EXTENT_LOCKED;
- if (clear_new_delalloc_bytes)
- clear_bits |= EXTENT_DELALLOC_NEW;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits,
&cached_state);
if (trans)
@@ -2630,12 +3345,24 @@ out:
if (ret || truncated) {
u64 unwritten_start = start;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
+ /* Drop extent maps for the part of the extent we didn't write. */
+ btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -2680,103 +3407,150 @@ out:
return ret;
}
-static void finish_ordered_fn(struct btrfs_work *work)
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate)
{
- struct btrfs_ordered_extent *ordered_extent;
- ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
- btrfs_finish_ordered_io(ordered_extent);
+ trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
+
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
}
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
- u64 end, int uptodate)
+/*
+ * Verify the checksum for a single sector without any extra action that depend
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *wq;
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
- trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
+ ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
- ClearPagePrivate2(page);
- if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1, uptodate))
- return;
+ shash->tfm = fs_info->csum_shash;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
+ kaddr = kmap_local_page(page) + pgoff;
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ kunmap_local(kaddr);
+
+ if (memcmp(csum, csum_expected, fs_info->csum_size))
+ return -EIO;
+ return 0;
+}
+
+static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered_extent->work);
+ return csums + offset_in_sectors * fs_info->csum_size;
}
-static int __readpage_endio_check(struct inode *inode,
- struct btrfs_io_bio *io_bio,
- int icsum, struct page *page,
- int pgoff, u64 start, size_t len)
+/*
+ * check_data_csum - verify checksum of one sector of uncompressed data
+ * @inode: inode
+ * @bbio: btrfs_bio which contains the csum
+ * @bio_offset: offset to the beginning of the bio (in bytes)
+ * @page: page where is the data to be verified
+ * @pgoff: offset inside the page
+ *
+ * The length of such check is always one sector size.
+ *
+ * When csum mismatch is detected, we will also report the error and fill the
+ * corrupted range with zero. (Thus it needs the extra parameters)
+ */
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ u32 len = fs_info->sectorsize;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
+ ASSERT(pgoff + len <= PAGE_SIZE);
- kaddr = kmap_atomic(page);
- shash->tfm = fs_info->csum_shash;
-
- crypto_shash_init(shash);
- crypto_shash_update(shash, kaddr + pgoff, len);
- crypto_shash_final(shash, csum);
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
- if (memcmp(csum, csum_expected, csum_size))
+ if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
goto zeroit;
-
- kunmap_atomic(kaddr);
return 0;
+
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
- memset(kaddr + pgoff, 1, len);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ btrfs_print_data_csum_error(BTRFS_I(inode),
+ bbio->file_offset + bio_offset,
+ csum, csum_expected, bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ memzero_page(page, pgoff, len);
return -EIO;
}
/*
- * when reads are done, we need to check csums to verify the data is correct
+ * When reads are done, we need to check csums to verify the data is correct.
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
+ *
+ * @bio_offset: offset to the beginning of the bio (in bytes)
+ * @start: file offset of the range start
+ * @end: file offset of the range end (inclusive)
+ *
+ * Return a bitmap where bit set means a csum mismatch, and bit not set means
+ * csum match.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
{
- size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u32 sectorsize = root->fs_info->sectorsize;
+ u32 pg_off;
+ unsigned int result = 0;
- if (PageChecked(page)) {
- ClearPageChecked(page);
+ /*
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
+ */
+ if (bbio->csum == NULL)
return 0;
- }
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+ if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
return 0;
- }
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
- start, (size_t)(end - start + 1));
+ ASSERT(page_offset(page) <= start &&
+ end <= page_offset(page) + PAGE_SIZE - 1);
+ for (pg_off = offset_in_page(start);
+ pg_off < offset_in_page(end);
+ pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
+ int ret;
+
+ if (btrfs_is_data_reloc_root(root) &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
+ ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
+ if (ret < 0) {
+ const int nr_bit = (pg_off - offset_in_page(start)) >>
+ root->fs_info->sectorsize_bits;
+
+ result |= (1U << nr_bit);
+ }
+ }
+ return result;
}
/*
@@ -2838,19 +3612,22 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
+ cond_resched_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
/**
- * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
- * @fs_info - the fs_info for this fs
- * @return - EINTR if we were killed, 0 if nothing's pending
+ * Wait for flushing all delayed iputs
+ *
+ * @fs_info: the filesystem
*
* This will wait on any delayed iputs that are currently running with KILLABLE
* set. Once they are all done running we will return, unless we are killed in
* which case we return EINTR. This helps in user operations like fallocate etc
* that might get blocked on the iputs.
+ *
+ * Return EINTR if we were killed, 0 if nothing's pending
*/
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
@@ -2904,7 +3681,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
- if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
+ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
path = btrfs_alloc_path();
@@ -2966,37 +3743,38 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &found_key, root);
+ inode = btrfs_iget(fs_info->sb, last_objectid, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
- * this is an orphan in the tree root. Currently these
+ * This is an orphan in the tree root. Currently these
* could come from 2 sources:
- * a) a snapshot deletion in progress
+ * a) a root (snapshot/subvolume) deletion in progress
* b) a free space cache inode
- * We need to distinguish those two, as the snapshot
- * orphan must not get deleted.
- * find_dead_roots already ran before us, so if this
- * is a snapshot deletion, we should find the root
- * in the dead_roots list
+ * We need to distinguish those two, as the orphan item
+ * for a root must not get deleted before the deletion
+ * of the snapshot/subvolume's tree completes.
+ *
+ * btrfs_find_orphan_roots() ran before us, which has
+ * found all deleted roots and loaded them into
+ * fs_info->fs_roots_radix. So here we can find if an
+ * orphan item corresponds to a deleted root by looking
+ * up the root from that radix tree.
*/
- spin_lock(&fs_info->trans_lock);
- list_for_each_entry(dead_root, &fs_info->dead_roots,
- root_list) {
- if (dead_root->root_key.objectid ==
- found_key.objectid) {
- is_dead_root = 1;
- break;
- }
- }
- spin_unlock(&fs_info->trans_lock);
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
+ if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+ is_dead_root = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
if (is_dead_root) {
/* prevent this orphan from being found again */
key.offset = found_key.objectid - 1;
@@ -3007,7 +3785,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
@@ -3025,8 +3810,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3050,8 +3839,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* release the path since we're done with it */
btrfs_release_path(path);
- root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
-
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
@@ -3187,6 +3974,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@@ -3213,7 +4002,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
@@ -3258,6 +4048,14 @@ cache_index:
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Same logic as for last_unlink_trans. We don't persist the generation
+ * of the last transaction where this inode was used for a reflink
+ * operation, so after eviction and reloading the inode we must be
+ * pessimistic and assume the last transaction that modified the inode.
+ */
+ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -3305,7 +4103,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -3337,53 +4134,54 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
- &token);
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->atime,
- inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->atime,
- inode->i_atime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->mtime,
- inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- inode->i_mtime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->ctime,
- inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- inode->i_ctime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec, &token);
-
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- &token);
- btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
- &token);
- btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
- &token);
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
+
+ btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
}
/*
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
@@ -3394,9 +4192,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
- ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
- 1);
+ ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -3407,7 +4203,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, leaf, inode_item, inode);
+ fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
btrfs_mark_buffer_dirty(leaf);
btrfs_set_inode_last_trans(trans, inode);
ret = 0;
@@ -3420,7 +4216,8 @@ failed:
* copy everything in the in-memory inode into the btree.
*/
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3432,8 +4229,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* The data relocation inode should also be directly updated
* without delay
*/
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ if (!btrfs_is_free_space_inode(inode)
+ && !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
@@ -3446,9 +4243,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
return btrfs_update_inode_item(trans, root, inode);
}
-noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode)
{
int ret;
@@ -3464,11 +4260,12 @@ noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len)
+ const char *name, int name_len,
+ struct btrfs_rename_ctx *rename_ctx)
{
+ struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
@@ -3483,7 +4280,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto out;
}
- path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
@@ -3523,26 +4319,28 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
}
- ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto err;
+ /*
+ * If we are in a rename context, we don't need to update anything in the
+ * log. That will be done later during the rename by btrfs_log_new_name().
+ * Besides that, doing it here would only cause extra unnecessary btree
+ * operations on the log tree, increasing latency for applications.
+ */
+ if (!rename_ctx) {
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
+ index);
}
- ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
- if (ret == -ENOENT)
- ret = 0;
- else if (ret)
- btrfs_abort_transaction(trans, ret);
-
/*
* If we have a pending delayed iput we could end up with the final iput
* being run in btrfs-cleaner context. If we have enough of these built
@@ -3561,23 +4359,23 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
- dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
+ dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
+ ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
}
return ret;
}
@@ -3600,13 +4398,13 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the dir index
* 1 for the inode ref
* 1 for the inode
+ * 1 for the parent inode
*/
- return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+ return btrfs_start_transaction_fallback_global_rsv(root, 6);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
@@ -3618,7 +4416,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
@@ -3632,7 +4430,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return ret;
}
@@ -3686,7 +4484,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* This is a placeholder inode for a subvolume we didn't have a
* reference to at the time of the snapshot creation. In the meantime
* we could have renamed the real subvol link into our snapshot, so
- * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
* Instead simply lookup the dir_index_item for this entry so we can
* remove it. Otherwise we know we have a ref to the root and we can
* call btrfs_del_root_ref, and it _shouldn't_ fail.
@@ -3725,8 +4523,9 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
+ ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -3799,7 +4598,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
@@ -3863,7 +4662,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
struct btrfs_block_rsv block_rsv;
u64 root_flags;
int ret;
- int err;
/*
* Don't allow to delete a subvolume with send in progress. This is
@@ -3878,6 +4676,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
return -EPERM;
}
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ return -EPERM;
+ }
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -3885,8 +4690,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
down_write(&fs_info->subvol_sem);
- err = may_destroy_subvol(dest);
- if (err)
+ ret = may_destroy_subvol(dest);
+ if (ret)
goto out_up_write;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
@@ -3895,13 +4700,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* two for dir entries,
* two for root ref/backref.
*/
- err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
- if (err)
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+ if (ret)
goto out_up_write;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_release;
}
trans->block_rsv = &block_rsv;
@@ -3911,16 +4716,19 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
- err = ret;
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
- btrfs_record_root_in_trans(trans, dest);
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
- dest->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&dest->root_item, 0);
btrfs_set_root_refs(&dest->root_item, 0);
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
@@ -3929,7 +4737,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
}
@@ -3939,7 +4746,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
@@ -3949,23 +4755,22 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
}
+ free_anon_bdev(dest->anon_dev);
+ dest->anon_dev = 0;
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans);
- if (ret && !err)
- err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
- if (err) {
+ if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
@@ -3975,29 +4780,29 @@ out_up_write:
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
-
- /* the last ref */
- if (dest->ino_cache_inode) {
- iput(dest->ino_cache_inode);
- dest->ino_cache_inode = NULL;
- }
}
- return err;
+ return ret;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int err = 0;
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
return btrfs_delete_subvolume(dir, dentry);
+ }
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4015,7 +4820,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
@@ -4036,348 +4841,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
}
/*
- * Return this if we need to call truncate_block for the last bit of the
- * truncate.
- */
-#define NEED_TRUNCATE_BLOCK 1
-
-/*
- * this can truncate away extent items, csum items and directory items.
- * It starts at a high offset and removes keys until it can't find
- * any higher than new_size
- *
- * csum items that cross the new i_size are truncated to the new size
- * as well.
- *
- * min_type is the minimum key type to truncate down to. If set to 0, this
- * will kill all the items on this inode, including the INODE_ITEM_KEY.
- */
-int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode,
- u64 new_size, u32 min_type)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *fi;
- struct btrfs_key key;
- struct btrfs_key found_key;
- u64 extent_start = 0;
- u64 extent_num_bytes = 0;
- u64 extent_offset = 0;
- u64 item_end = 0;
- u64 last_size = new_size;
- u32 found_type = (u8)-1;
- int found_extent;
- int del_item;
- int pending_del_nr = 0;
- int pending_del_slot = 0;
- int extent_type = -1;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
- u64 bytes_deleted = 0;
- bool be_nice = false;
- bool should_throttle = false;
- const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
- struct extent_state *cached_state = NULL;
-
- BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
-
- /*
- * for non-free space inodes and ref cows, we want to back off from
- * time to time
- */
- if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
- test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- be_nice = true;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_BACK;
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
- &cached_state);
-
- /*
- * We want to drop from the next block forward in case this new size is
- * not block aligned since we will be keeping the last block of the
- * extent just the way it is.
- */
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root)
- btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
- fs_info->sectorsize),
- (u64)-1, 0);
-
- /*
- * This function is also used to drop the items in the log tree before
- * we relog the inode, so if root != BTRFS_I(inode)->root, it means
- * it is used to drop the logged items. So we shouldn't kill the delayed
- * items.
- */
- if (min_type == 0 && root == BTRFS_I(inode)->root)
- btrfs_kill_delayed_inode_items(BTRFS_I(inode));
-
- key.objectid = ino;
- key.offset = (u64)-1;
- key.type = (u8)-1;
-
-search_again:
- /*
- * with a 16K leaf size and 128MB extents, you can actually queue
- * up a huge file in a single leaf. Most of the time that
- * bytes_deleted is > 0, it will be huge by the time we get here
- */
- if (be_nice && bytes_deleted > SZ_32M &&
- btrfs_should_end_transaction(trans)) {
- ret = -EAGAIN;
- goto out;
- }
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = 0;
- /* there are no items in the tree for us to truncate, we're
- * done
- */
- if (path->slots[0] == 0)
- goto out;
- path->slots[0]--;
- }
-
- while (1) {
- fi = NULL;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
-
- if (found_key.objectid != ino)
- break;
-
- if (found_type < min_type)
- break;
-
- item_end = found_key.offset;
- if (found_type == BTRFS_EXTENT_DATA_KEY) {
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(leaf, fi);
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- item_end +=
- btrfs_file_extent_num_bytes(leaf, fi);
-
- trace_btrfs_truncate_show_fi_regular(
- BTRFS_I(inode), leaf, fi,
- found_key.offset);
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- item_end += btrfs_file_extent_ram_bytes(leaf,
- fi);
-
- trace_btrfs_truncate_show_fi_inline(
- BTRFS_I(inode), leaf, fi, path->slots[0],
- found_key.offset);
- }
- item_end--;
- }
- if (found_type > min_type) {
- del_item = 1;
- } else {
- if (item_end < new_size)
- break;
- if (found_key.offset >= new_size)
- del_item = 1;
- else
- del_item = 0;
- }
- found_extent = 0;
- /* FIXME, shrink the extent if the ref count is only 1 */
- if (found_type != BTRFS_EXTENT_DATA_KEY)
- goto delete;
-
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- u64 num_dec;
- extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (!del_item) {
- u64 orig_num_bytes =
- btrfs_file_extent_num_bytes(leaf, fi);
- extent_num_bytes = ALIGN(new_size -
- found_key.offset,
- fs_info->sectorsize);
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_num_bytes);
- num_dec = (orig_num_bytes -
- extent_num_bytes);
- if (test_bit(BTRFS_ROOT_REF_COWS,
- &root->state) &&
- extent_start != 0)
- inode_sub_bytes(inode, num_dec);
- btrfs_mark_buffer_dirty(leaf);
- } else {
- extent_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf,
- fi);
- extent_offset = found_key.offset -
- btrfs_file_extent_offset(leaf, fi);
-
- /* FIXME blocksize != 4096 */
- num_dec = btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_start != 0) {
- found_extent = 1;
- if (test_bit(BTRFS_ROOT_REF_COWS,
- &root->state))
- inode_sub_bytes(inode, num_dec);
- }
- }
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- /*
- * we can't truncate inline items that have had
- * special encodings
- */
- if (!del_item &&
- btrfs_file_extent_encryption(leaf, fi) == 0 &&
- btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
- btrfs_file_extent_compression(leaf, fi) == 0) {
- u32 size = (u32)(new_size - found_key.offset);
-
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(path, size, 1);
- } else if (!del_item) {
- /*
- * We have to bail so the last_size is set to
- * just before this extent.
- */
- ret = NEED_TRUNCATE_BLOCK;
- break;
- }
-
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- inode_sub_bytes(inode, item_end + 1 - new_size);
- }
-delete:
- if (del_item)
- last_size = found_key.offset;
- else
- last_size = new_size;
- if (del_item) {
- if (!pending_del_nr) {
- /* no pending yet, add ourselves */
- pending_del_slot = path->slots[0];
- pending_del_nr = 1;
- } else if (pending_del_nr &&
- path->slots[0] + 1 == pending_del_slot) {
- /* hop on the pending chunk */
- pending_del_nr++;
- pending_del_slot = path->slots[0];
- } else {
- BUG();
- }
- } else {
- break;
- }
- should_throttle = false;
-
- if (found_extent &&
- (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root)) {
- struct btrfs_ref ref = { 0 };
-
- bytes_deleted += extent_num_bytes;
-
- btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
- ref.real_root = root->root_key.objectid;
- btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset);
- ret = btrfs_free_extent(trans, &ref);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- if (be_nice) {
- if (btrfs_should_throttle_delayed_refs(trans))
- should_throttle = true;
- }
- }
-
- if (found_type == BTRFS_INODE_ITEM_KEY)
- break;
-
- if (path->slots[0] == 0 ||
- path->slots[0] != pending_del_slot ||
- should_throttle) {
- if (pending_del_nr) {
- ret = btrfs_del_items(trans, root, path,
- pending_del_slot,
- pending_del_nr);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- break;
- }
- pending_del_nr = 0;
- }
- btrfs_release_path(path);
-
- /*
- * We can generate a lot of delayed refs, so we need to
- * throttle every once and a while and make sure we're
- * adding enough space to keep up with the work we are
- * generating. Since we hold a transaction here we
- * can't flush, and we don't want to FLUSH_LIMIT because
- * we could have generated too many delayed refs to
- * actually allocate, so just bail if we're short and
- * let the normal reservation dance happen higher up.
- */
- if (should_throttle) {
- ret = btrfs_delayed_refs_rsv_refill(fs_info,
- BTRFS_RESERVE_NO_FLUSH);
- if (ret) {
- ret = -EAGAIN;
- break;
- }
- }
- goto search_again;
- } else {
- path->slots[0]--;
- }
- }
-out:
- if (ret >= 0 && pending_del_nr) {
- int err;
-
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
- }
- }
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ASSERT(last_size >= new_size);
- if (!ret && last_size > new_size)
- last_size = new_size;
- btrfs_ordered_update_i_size(inode, last_size, NULL);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
- (u64)-1, &cached_state);
- }
-
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* btrfs_truncate_block - read, zero a chunk and write a block
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
@@ -4388,21 +4857,22 @@ out:
* This will find the block for the "from" offset and cow the block and zero the
* part we want to zero. This is used with truncate and hole punching.
*/
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- char *kaddr;
+ bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
+ size_t write_bytes = blocksize;
int ret = 0;
u64 block_start;
u64 block_end;
@@ -4414,23 +4884,38 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- block_start, blocksize);
- if (ret)
+ ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
+ blocksize, false);
+ if (ret < 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
+ /* For nocow case, no need to reserve data space */
+ only_release_metadata = true;
+ } else {
+ goto out;
+ }
+ }
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ block_start, blocksize);
goto out;
-
+ }
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, data_reserved,
- block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ btrfs_delalloc_release_space(inode, data_reserved, block_start,
+ blocksize, true);
+ btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
}
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
if (!PageUptodate(page)) {
- ret = btrfs_readpage(NULL, page);
+ ret = btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
@@ -4444,78 +4929,82 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, block_start, block_end, &cached_state);
- set_page_extent_mapped(page);
+ lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
+ clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state);
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
if (offset != blocksize) {
if (!len)
len = blocksize - offset;
- kaddr = kmap(page);
if (front)
- memset(kaddr + (block_start - page_offset(page)),
- 0, offset);
+ memzero_page(page, (block_start - page_offset(page)),
+ offset);
else
- memset(kaddr + (block_start - page_offset(page)) + offset,
- 0, len);
- flush_dcache_page(page);
- kunmap(page);
+ memzero_page(page, (block_start - page_offset(page)) + offset,
+ len);
}
- ClearPageChecked(page);
- set_page_dirty(page);
- unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
+ btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
+
+ if (only_release_metadata)
+ set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, NULL, GFP_NOFS);
out_unlock:
- if (ret)
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ if (ret) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
+ }
+ btrfs_delalloc_release_extents(inode, blocksize);
unlock_page(page);
put_page(page);
out:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
extent_changeset_free(data_reserved);
return ret;
}
-static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
u64 offset, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantee that the next
+ * fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
@@ -4526,19 +5015,24 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+ drop_args.start = offset;
+ drop_args.end = offset + len;
+ drop_args.drop_cache = true;
+
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
- offset, 0, 0, len, 0, len, 0, 0, 0);
- if (ret)
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
+ if (ret) {
btrfs_abort_transaction(trans, ret);
- else
+ } else {
+ btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
btrfs_update_inode(trans, root, inode);
+ }
btrfs_end_transaction(trans);
return ret;
}
@@ -4549,14 +5043,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
* these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
* the range between oldsize and size
*/
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
@@ -4576,11 +5069,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
- btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
- block_end - 1, &cached_state);
+ btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
+ &cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset);
if (IS_ERR(em)) {
err = PTR_ERR(em);
@@ -4589,20 +5082,27 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
}
last_byte = min(extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
+ hole_size = last_byte - cur_offset;
+
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
- hole_size = last_byte - cur_offset;
err = maybe_insert_hole(root, inode, cur_offset,
hole_size);
if (err)
break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + hole_size - 1, 0);
+
+ err = btrfs_inode_set_file_extent_range(inode,
+ cur_offset, hole_size);
+ if (err)
+ break;
+
hole_em = alloc_extent_map();
if (!hole_em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ cur_offset + hole_size - 1,
+ false);
+ btrfs_set_inode_full_sync(inode);
goto next;
}
hole_em->start = cur_offset;
@@ -4616,18 +5116,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
- while (1) {
- write_lock(&em_tree->lock);
- err = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- if (err != -EEXIST)
- break;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
- cur_offset +
- hole_size - 1, 0);
- }
+ err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
+ } else {
+ err = btrfs_inode_set_file_extent_range(inode,
+ cur_offset, hole_size);
+ if (err)
+ break;
}
next:
free_extent_map(em);
@@ -4637,7 +5132,7 @@ next:
break;
}
free_extent_map(em);
- unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
+ unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
@@ -4658,9 +5153,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
*/
if (newsize != oldsize) {
inode_inc_iversion(inode);
- if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
}
if (newsize > oldsize) {
@@ -4671,42 +5167,48 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* truncation, it must capture all writes that happened before
* this truncation.
*/
- btrfs_wait_for_snapshot_creation(root);
- ret = btrfs_cont_expand(inode, oldsize, newsize);
+ btrfs_drew_write_lock(&root->snapshot_lock);
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
if (ret) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return ret;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return PTR_ERR(trans);
}
i_size_write(inode, newsize);
- btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, inode);
- btrfs_end_write_no_snapshotting(root);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode,
+ ALIGN(newsize, fs_info->sectorsize),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -4728,7 +5230,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
return ret;
}
-static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4737,7 +5240,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -4748,60 +5251,44 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid) {
- setattr_copy(inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(inode, inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
}
/*
- * While truncating the inode pages during eviction, we get the VFS calling
- * btrfs_invalidatepage() against each page of the inode. This is slow because
- * the calls to btrfs_invalidatepage() result in a huge amount of calls to
- * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
- * extent_state structures over and over, wasting lots of time.
+ * While truncating the inode pages during eviction, we get the VFS
+ * calling btrfs_invalidate_folio() against each folio of the inode. This
+ * is slow because the calls to btrfs_invalidate_folio() result in a
+ * huge amount of calls to lock_extent() and clear_extent_bit(),
+ * which keep merging and splitting extent_state structures over and over,
+ * wasting lots of time.
*
- * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
- * those expensive operations on a per page basis and do only the ordered io
- * finishing, while we release here the extent_map and extent_state structures,
- * without the excessive merging and splitting.
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
+ * skip all those expensive operations on a per folio basis and do only
+ * the ordered io finishing, while we release here the extent_map and
+ * extent_state structures, without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
- write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
- struct extent_map *em;
-
- node = rb_first_cached(&map_tree->map);
- em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- remove_extent_mapping(map_tree, em);
- free_extent_map(em);
- if (need_resched()) {
- write_unlock(&map_tree->lock);
- cond_resched();
- write_lock(&map_tree->lock);
- }
- }
- write_unlock(&map_tree->lock);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* Keep looping until we have no more ranges in the io tree.
- * We can have ongoing bios started by readpages (called from readahead)
- * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * We can have ongoing bios started by readahead that have
+ * their endio callback (extent_io.c:end_bio_extent_readpage)
* still in progress (unlocked the pages in the bio but did not yet
* unlocked the ranges in the io tree). Therefore this means some
* ranges can still be locked and eviction started because before
@@ -4829,22 +5316,22 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
* and its reserved space won't be freed by delayed_ref.
* So we need to free its reserved space here.
- * (Refer to comment in btrfs_invalidatepage, case 2)
+ * (Refer to comment in btrfs_invalidate_folio, case 2)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state_flags & EXTENT_DELALLOC)
- btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+ end - start + 1);
clear_extent_bit(io_tree, start, end,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
&cached_state);
cond_resched();
@@ -4857,7 +5344,6 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_trans_handle *trans;
u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
int ret;
@@ -4872,18 +5358,16 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* above. We reserve our extra bit here because we generate a ton of
* delayed refs activity by truncating.
*
- * If we cannot make our reservation we'll attempt to steal from the
- * global reserve, because we really want to be able to free up space.
+ * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
+ * if we fail to make this reservation we can re-try without the
+ * delayed_refs_extra so we can make some forward progress.
*/
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_EVICT);
if (ret) {
- /*
- * Try to steal from the global reserve if there is space for
- * it.
- */
- if (btrfs_check_space_for_delayed_refs(fs_info) ||
- btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
+ BTRFS_RESERVE_FLUSH_EVICT);
+ if (ret) {
btrfs_warn(fs_info,
"could not allocate space for delete; will truncate on mount");
return ERR_PTR(-ENOSPC);
@@ -4915,6 +5399,7 @@ void btrfs_evict_inode(struct inode *inode)
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
@@ -4941,26 +5426,45 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ /*
+ * This makes sure the inode item in tree is uptodate and the space for
+ * the inode update is released.
+ */
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
if (ret)
goto no_delete;
+ /*
+ * This drops any pending insert or delete operations we have for this
+ * inode. We could have a delayed dir index deletion queued up, but
+ * we're removing the inode completely so that'll be taken care of in
+ * the truncate.
+ */
+ btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .new_size = 0,
+ .min_type = 0,
+ };
+
trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
trans->block_rsv = rsv;
- ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -4987,10 +5491,6 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans);
}
- if (!(root == fs_info->tree_root ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
- btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
-
free_rsv:
btrfs_free_block_rsv(fs_info, rsv);
no_delete:
@@ -5000,6 +5500,7 @@ no_delete:
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
@@ -5098,7 +5599,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
btrfs_release_path(path);
- new_root = btrfs_read_fs_root_no_name(fs_info, location);
+ new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
@@ -5150,15 +5651,15 @@ static void inode_tree_add(struct inode *inode)
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -5176,29 +5677,37 @@ static void inode_tree_del(struct inode *inode)
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->location->objectid;
- memcpy(&BTRFS_I(inode)->location, args->location,
- sizeof(*args->location));
- BTRFS_I(inode)->root = args->root;
+
+ inode->i_ino = args->ino;
+ BTRFS_I(inode)->location.objectid = args->ino;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.offset = 0;
+ BTRFS_I(inode)->root = btrfs_grab_root(args->root);
+ BUG_ON(args->root && !BTRFS_I(inode)->root);
+
+ if (args->root && args->root == args->root->fs_info->tree_root &&
+ args->ino != BTRFS_BTREE_INODE_OBJECTID)
+ set_bit(BTRFS_INODE_FREE_SPACE_INODE,
+ &BTRFS_I(inode)->runtime_flags);
return 0;
}
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+
+ return args->ino == BTRFS_I(inode)->location.objectid &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s,
- struct btrfs_key *location,
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
- unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+ unsigned long hashval = btrfs_inode_hash(ino, root);
- args.location = location;
+ args.ino = ino;
args.root = root;
inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -5208,17 +5717,17 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
}
/*
- * Get an inode object given its location and corresponding root.
+ * Get an inode object given its inode number and corresponding root.
* Path can be preallocated to prevent recursing back to iget through
* allocator. NULL is also valid but may require an additional allocation
* later.
*/
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
- inode = btrfs_iget_locked(s, location, root);
+ inode = btrfs_iget_locked(s, ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -5245,10 +5754,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
return inode;
}
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root)
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, location, root, NULL);
+ return btrfs_iget_path(s, ino, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
@@ -5260,7 +5768,7 @@ static struct inode *new_simple_dir(struct super_block *s,
if (!inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->root = btrfs_grab_root(root);
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -5281,21 +5789,17 @@ static struct inode *new_simple_dir(struct super_block *s,
return inode;
}
+static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
+static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
+static_assert(BTRFS_FT_DIR == FT_DIR);
+static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
+static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
+static_assert(BTRFS_FT_FIFO == FT_FIFO);
+static_assert(BTRFS_FT_SOCK == FT_SOCK);
+static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
+
static inline u8 btrfs_inode_type(struct inode *inode)
{
- /*
- * Compile-time asserts that generic FT_* types still match
- * BTRFS_FT_* types
- */
- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
-
return fs_umode_to_ftype(inode->i_mode);
}
@@ -5307,7 +5811,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct btrfs_root *sub_root = root;
struct btrfs_key location;
u8 di_type = 0;
- int index;
int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
@@ -5318,7 +5821,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root);
+ inode = btrfs_iget(dir->i_sb, location.objectid, root);
if (IS_ERR(inode))
return inode;
@@ -5334,20 +5837,20 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return inode;
}
- index = srcu_read_lock(&fs_info->subvol_srcu);
ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
if (ret < 0) {
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
- inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ inode = new_simple_dir(dir->i_sb, &location, root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root);
- }
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ btrfs_put_root(sub_root);
+
+ if (IS_ERR(inode))
+ return inode;
- if (!IS_ERR(inode) && root != sub_root) {
down_read(&fs_info->cleanup_work_sem);
if (!sb_rdonly(inode->i_sb))
ret = btrfs_orphan_cleanup(sub_root);
@@ -5453,8 +5956,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct list_head ins_list;
struct list_head del_list;
int ret;
- struct extent_buffer *leaf;
- int slot;
char *name_ptr;
int name_len;
int entries = 0;
@@ -5481,35 +5982,19 @@ again:
key.offset = ctx->pos;
key.objectid = btrfs_ino(BTRFS_I(inode));
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, ret) {
struct dir_entry *entry;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ struct extent_buffer *leaf = path->nodes[0];
if (found_key.objectid != key.objectid)
break;
if (found_key.type != BTRFS_DIR_INDEX_KEY)
break;
if (found_key.offset < ctx->pos)
- goto next;
+ continue;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
- goto next;
- di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ continue;
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
@@ -5536,9 +6021,11 @@ again:
entries++;
addr += sizeof(struct dir_entry) + name_len;
total_len += sizeof(struct dir_entry) + name_len;
-next:
- path->slots[0]++;
}
+ /* Catch error encountered during iteration */
+ if (ret < 0)
+ goto err;
+
btrfs_release_path(path);
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
@@ -5599,15 +6086,15 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && ret == -ENOSPC) {
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
}
btrfs_end_transaction(trans);
if (BTRFS_I(inode)->delayed_node)
@@ -5669,14 +6156,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
goto out;
ret = 0;
- /*
- * MAGIC NUMBER EXPLANATION:
- * since we search a directory based on f_pos we have to start at 2
- * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
- * else has to start at 2
- */
if (path->slots[0] == 0) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -5687,7 +6168,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -5723,7 +6204,8 @@ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.location = &BTRFS_I(inode)->location;
+
+ args.ino = BTRFS_I(inode)->location.objectid;
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -5731,6 +6213,57 @@ static int btrfs_insert_inode_locked(struct inode *inode)
btrfs_find_actor, &args);
}
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items)
+{
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ int ret;
+
+ ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
+ if (ret)
+ return ret;
+
+ /* 1 to add inode item */
+ *trans_num_items = 1;
+ /* 1 to add compression property */
+ if (BTRFS_I(dir)->prop_compress)
+ (*trans_num_items)++;
+ /* 1 to add default ACL xattr */
+ if (args->default_acl)
+ (*trans_num_items)++;
+ /* 1 to add access ACL xattr */
+ if (args->acl)
+ (*trans_num_items)++;
+#ifdef CONFIG_SECURITY
+ /* 1 to add LSM xattr */
+ if (dir->i_security)
+ (*trans_num_items)++;
+#endif
+ if (args->orphan) {
+ /* 1 to add orphan item */
+ (*trans_num_items)++;
+ } else {
+ /*
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
+ *
+ * No need for 1 unit for the inode ref item because it is
+ * inserted in a batch together with the inode item at
+ * btrfs_create_new_inode().
+ */
+ *trans_num_items += 3;
+ }
+ return 0;
+}
+
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
+{
+ posix_acl_release(args->acl);
+ posix_acl_release(args->default_acl);
+}
+
/*
* Inherit flags from the parent inode.
*
@@ -5740,9 +6273,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
unsigned int flags;
- if (!dir)
- return;
-
flags = BTRFS_I(dir)->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
@@ -5762,81 +6292,92 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
btrfs_sync_inode_flags_to_i_flags(inode);
}
-static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *dir,
- const char *name, int name_len,
- u64 ref_objectid, u64 objectid,
- umode_t mode, u64 *index)
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ const char *name = args->orphan ? NULL : args->dentry->d_name.name;
+ int name_len = args->orphan ? 0 : args->dentry->d_name.len;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
struct btrfs_path *path;
+ u64 objectid;
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
- int nitems = name ? 2 : 1;
+ struct btrfs_item_batch batch;
unsigned long ptr;
- unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
-
- nofs_flag = memalloc_nofs_save();
- inode = new_inode(fs_info->sb);
- memalloc_nofs_restore(nofs_flag);
- if (!inode) {
- btrfs_free_path(path);
- return ERR_PTR(-ENOMEM);
- }
+ return -ENOMEM;
- /*
- * O_TMPFILE, set link count to 0, so that after this point,
- * we fill in an inode item with the correct link count.
- */
- if (!name)
- set_nlink(inode, 0);
+ if (!args->subvol)
+ BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
+ root = BTRFS_I(inode)->root;
- /*
- * we have to initialize this early, so we can reclaim the inode
- * number if we fail afterwards in this function.
- */
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
+ goto out;
inode->i_ino = objectid;
- if (dir && name) {
+ if (args->orphan) {
+ /*
+ * O_TMPFILE, set link count to 0, so that after this point, we
+ * fill in an inode item with the correct link count.
+ */
+ set_nlink(inode, 0);
+ } else {
trace_btrfs_inode_request(dir);
- ret = btrfs_set_inode_index(BTRFS_I(dir), index);
- if (ret) {
- btrfs_free_path(path);
- iput(inode);
- return ERR_PTR(ret);
- }
- } else if (dir) {
- *index = 0;
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
+ if (ret)
+ goto out;
}
- /*
- * index_cnt is ignored for everything but a dir,
- * btrfs_set_inode_index_count has an explanation for the magic
- * number
- */
- BTRFS_I(inode)->index_cnt = 2;
- BTRFS_I(inode)->dir_index = *index;
- BTRFS_I(inode)->root = root;
+ /* index_cnt is ignored for everything but a dir. */
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * Subvolumes don't inherit flags from their parent directory.
+ * Originally this was probably by accident, but we probably can't
+ * change it now without compatibility issues.
+ */
+ if (!args->subvol)
+ btrfs_inherit_iflags(inode, dir);
+
+ if (S_ISREG(inode->i_mode)) {
+ if (btrfs_test_opt(fs_info, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(fs_info, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ }
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0) {
+ if (!args->orphan)
+ BTRFS_I(dir)->index_cnt--;
+ goto out;
+ }
+
+ /*
* We could have gotten an inode number from somebody who was fsynced
* and then removed in this same transaction, so let's just set full
* sync since it will be a full sync anyway and this will blow away the
* old info in the log.
*/
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
key[0].objectid = objectid;
key[0].type = BTRFS_INODE_ITEM_KEY;
@@ -5844,7 +6385,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
- if (name) {
+ if (!args->orphan) {
/*
* Start new inodes with an inode_ref. This is slightly more
* efficient for small numbers of hard links since they will
@@ -5853,85 +6394,133 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
key[1].objectid = objectid;
key[1].type = BTRFS_INODE_REF_KEY;
- key[1].offset = ref_objectid;
-
- sizes[1] = name_len + sizeof(*ref);
+ if (args->subvol) {
+ key[1].offset = objectid;
+ sizes[1] = 2 + sizeof(*ref);
+ } else {
+ key[1].offset = btrfs_ino(BTRFS_I(dir));
+ sizes[1] = name_len + sizeof(*ref);
+ }
}
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
- ret = btrfs_insert_inode_locked(inode);
- if (ret < 0) {
- iput(inode);
- goto fail;
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
+ batch.nr = args->orphan ? 1 : 2;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
+ if (ret != 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
}
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
- if (ret != 0)
- goto fail_unlock;
-
- inode_init_owner(inode, dir, mode);
- inode_set_bytes(inode, 0);
-
inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
+ /*
+ * We're going to fill the inode item now, so at this point the inode
+ * must be fully initialized.
+ */
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- if (name) {
+ if (!args->orphan) {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (args->subvol) {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
+ write_extent_buffer(path->nodes[0], "..", ptr, 2);
+ } else {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref,
+ BTRFS_I(inode)->dir_index);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
}
btrfs_mark_buffer_dirty(path->nodes[0]);
+ /*
+ * We don't need the path anymore, plus inheriting properties, adding
+ * ACLs, security xattrs, orphan item or adding the link, will result in
+ * allocating yet another path. So just free our path.
+ */
btrfs_free_path(path);
+ path = NULL;
- btrfs_inherit_iflags(inode, dir);
+ if (args->subvol) {
+ struct inode *parent;
- if (S_ISREG(mode)) {
- if (btrfs_test_opt(fs_info, NODATASUM))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(fs_info, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ /*
+ * Subvolumes inherit properties from their parent subvolume,
+ * not the directory they were created in.
+ */
+ parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_I(dir)->root);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, parent);
+ iput(parent);
+ }
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ }
+ if (ret) {
+ btrfs_err(fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
+ ret);
+ }
+
+ /*
+ * Subvolumes don't inherit ACLs or get passed to the LSM. This is
+ * probably a bug.
+ */
+ if (!args->subvol) {
+ ret = btrfs_init_inode_security(trans, args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
- btrfs_set_inode_last_trans(trans, inode);
+ btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
btrfs_update_root_times(trans, root);
- ret = btrfs_inode_inherit_props(trans, inode, dir);
- if (ret)
- btrfs_err(fs_info,
- "error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
+ if (args->orphan) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ } else {
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ name_len, 0, BTRFS_I(inode)->dir_index);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
- return inode;
+ return 0;
-fail_unlock:
+discard:
+ /*
+ * discard_new_inode() calls iput(), but the caller owns the reference
+ * to the inode.
+ */
+ ihold(inode);
discard_new_inode(inode);
-fail:
- if (dir && name)
- BTRFS_I(dir)->index_cnt--;
+out:
btrfs_free_path(path);
- return ERR_PTR(ret);
+ return ret;
}
/*
@@ -5995,7 +6584,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
parent_inode->vfs_inode.i_mtime = now;
parent_inode->vfs_inode.i_ctime = now;
}
- ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6023,148 +6612,71 @@ fail_dir_item:
return ret;
}
-static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
- struct btrfs_inode *dir, struct dentry *dentry,
- struct btrfs_inode *inode, int backref, u64 index)
-{
- int err = btrfs_add_link(trans, dir, inode,
- dentry->d_name.name, dentry->d_name.len,
- backref, index);
- if (err > 0)
- err = -EEXIST;
- return err;
-}
-
-static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .inode = inode,
+ };
+ unsigned int trans_num_items;
+ struct btrfs_trans_handle *trans;
int err;
- u64 objectid;
- u64 index = 0;
-
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- btrfs_update_inode(trans, root, inode);
- d_instantiate_new(dentry, inode);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!err)
+ d_instantiate_new(dentry, inode);
-out_unlock:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
-static int btrfs_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, bool excl)
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- int err;
- u64 objectid;
- u64 index = 0;
+ struct inode *inode;
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ return btrfs_create_common(dir, dentry, inode);
+}
- err = btrfs_find_free_ino(root, &objectid);
- if (err)
- goto out_unlock;
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct inode *inode;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
- }
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_update_inode(trans, root, inode);
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- d_instantiate_new(dentry, inode);
-
-out_unlock:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6210,16 +6722,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 1, index);
+ err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ dentry->d_name.name, dentry->d_name.len, 1, index);
if (err) {
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -6232,12 +6743,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
fail:
@@ -6251,67 +6757,18 @@ fail:
return err;
}
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- int err = 0;
- u64 objectid = 0;
- u64 index = 0;
-
- /*
- * 2 items for inode and ref
- * 2 items for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- err = btrfs_find_free_ino(root, &objectid);
- if (err)
- goto out_fail;
-
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFDIR | mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_fail;
- }
+ struct inode *inode;
- /* these must be set before we unlock the inode */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_fail;
-
- btrfs_i_size_write(BTRFS_I(inode), 0);
- err = btrfs_update_inode(trans, root, inode);
- if (err)
- goto out_fail;
-
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- dentry->d_name.name,
- dentry->d_name.len, 0, index);
- if (err)
- goto out_fail;
-
- d_instantiate_new(dentry, inode);
-
-out_fail:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6330,8 +6787,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
- inline_size = btrfs_file_extent_inline_item_len(leaf,
- btrfs_item_nr(path->slots[0]));
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
tmp = kmalloc(inline_size, GFP_NOFS);
if (!tmp)
return -ENOMEM;
@@ -6351,11 +6807,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size + pg_offset < PAGE_SIZE) {
- char *map = kmap(page);
- memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
- kunmap(page);
- }
+ if (max_size + pg_offset < PAGE_SIZE)
+ memzero_page(page, pg_offset + max_size,
+ PAGE_SIZE - max_size - pg_offset);
kfree(tmp);
return ret;
}
@@ -6383,8 +6837,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6396,7 +6849,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_io_tree *io_tree = &inode->io_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -6412,7 +6864,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
@@ -6422,7 +6874,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -6430,19 +6882,23 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path->reada = READA_FORWARD;
/*
- * Unless we're going to uncompress the inline extent, no sleep would
- * happen.
+ * The same explanation in load_free_space_cache applies here as well,
+ * we only read when we're loading the free space cache, and at that
+ * point the commit_root has everything we need.
*/
- path->leave_spinning = 1;
+ if (btrfs_is_free_space_inode(inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -6463,6 +6919,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
+ extent_end = btrfs_file_extent_end(path);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
@@ -6473,18 +6930,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
btrfs_ino(inode));
goto out;
}
- extent_end = extent_start +
- btrfs_file_extent_num_bytes(leaf, item);
-
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- size_t size;
-
- size = btrfs_file_extent_ram_bytes(leaf, item);
- extent_end = ALIGN(extent_start + size,
- fs_info->sectorsize);
-
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
path->slots[0],
extent_start);
@@ -6494,12 +6942,11 @@ next:
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6544,18 +6991,15 @@ next:
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
- btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
- map = kmap(page);
+ map = kmap_local_page(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
if (pg_offset + copy_size < PAGE_SIZE) {
@@ -6563,12 +7007,10 @@ next:
PAGE_SIZE - pg_offset -
copy_size);
}
- kunmap(page);
+ kunmap_local(map);
}
flush_dcache_page(page);
}
- set_extent_uptodate(io_tree, em->start,
- extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
}
not_found:
@@ -6577,160 +7019,32 @@ not_found:
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
- free_extent_map(em);
- return ERR_PTR(err);
- }
- BUG_ON(!em); /* Error is always set */
- return em;
-}
-
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len)
-{
- struct extent_map *em;
- struct extent_map *hole_em = NULL;
- u64 delalloc_start = start;
- u64 end;
- u64 delalloc_len;
- u64 delalloc_end;
- int err = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, start, len);
- if (IS_ERR(em))
- return em;
- /*
- * If our em maps to:
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
-
- /* check to see if we've wrapped (len == -1 or similar) */
- end = start + len;
- if (end < start)
- end = (u64)-1;
- else
- end -= 1;
-
- em = NULL;
-
- /* ok, we didn't find anything, lets look for delalloc */
- delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
- end, len, EXTENT_DELALLOC, 1);
- delalloc_end = delalloc_start + delalloc_len;
- if (delalloc_end < delalloc_start)
- delalloc_end = (u64)-1;
-
- /*
- * We didn't find anything useful, return the original results from
- * get_extent()
- */
- if (delalloc_start > end || delalloc_end <= start) {
- em = hole_em;
- hole_em = NULL;
- goto out;
- }
-
- /*
- * Adjust the delalloc_start to make sure it doesn't go backwards from
- * the start they passed in
- */
- delalloc_start = max(start, delalloc_start);
- delalloc_len = delalloc_end - delalloc_start;
-
- if (delalloc_len > 0) {
- u64 hole_start;
- u64 hole_len;
- const u64 hole_end = extent_map_end(hole_em);
-
- em = alloc_extent_map();
- if (!em) {
- err = -ENOMEM;
- goto out;
- }
-
- ASSERT(hole_em);
- /*
- * When btrfs_get_extent can't find anything it returns one
- * huge hole
- *
- * Make sure what it found really fits our range, and adjust to
- * make sure it is based on the start from the caller
- */
- if (hole_end <= start || hole_em->start > end) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = hole_end - hole_start;
- }
-
- if (hole_em && delalloc_start > hole_start) {
- /*
- * Our hole starts before our delalloc, so we have to
- * return just the parts of the hole that go until the
- * delalloc starts
- */
- em->len = min(hole_len, delalloc_start - hole_start);
- em->start = hole_start;
- em->orig_start = hole_start;
- /*
- * Don't adjust block start at all, it is fixed at
- * EXTENT_MAP_HOLE
- */
- em->block_start = hole_em->block_start;
- em->block_len = hole_len;
- if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- } else {
- /*
- * Hole is out of passed range or it starts after
- * delalloc range
- */
- em->start = delalloc_start;
- em->len = delalloc_len;
- em->orig_start = delalloc_start;
- em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = delalloc_len;
- }
- } else {
- return hole_em;
- }
-out:
-
- free_extent_map(hole_em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
const u64 orig_start,
@@ -6744,21 +7058,23 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
int ret;
if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start,
- block_start, block_len, orig_block_len,
- ram_bytes,
+ em = create_io_em(inode, start, len, orig_start, block_start,
+ block_len, orig_block_len, ram_bytes,
BTRFS_COMPRESS_NONE, /* compress_type */
type);
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
- len, block_len, type);
+ ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
+ block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
- start + len - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + len - 1, false);
}
em = ERR_PTR(ret);
}
@@ -6767,11 +7083,11 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
return em;
}
-static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
u64 start, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
@@ -6788,21 +7104,51 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
return em;
}
+static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ bool readonly = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = true;
+ if (block_group)
+ btrfs_put_block_group(block_group);
+ return readonly;
+}
+
/*
- * returns 1 when the nocow is safe, < 1 on error, 0 if the
- * block must be cow'd
+ * Check if we can do nocow write into the range [@offset, @offset + @len)
+ *
+ * @offset: File offset
+ * @len: The length to write, will be updated to the nocow writeable
+ * range
+ * @orig_start: (optional) Return the original file offset of the file extent
+ * @orig_len: (optional) Return the original on-disk length of the file extent
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
+ * @strict: if true, omit optimizations that might force us into unnecessary
+ * cow. e.g., don't trust generation number.
+ *
+ * Return:
+ * >0 and update @len if we can do nocow write
+ * 0 if we can't do nocow write
+ * <0 if error happened
+ *
+ * NOTE: This only checks the file extents, caller is responsible to wait for
+ * any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+ u64 *ram_bytes, bool nowait, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
@@ -6810,35 +7156,29 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 disk_bytenr;
- u64 backref_offset;
- u64 extent_end;
- u64 num_bytes;
- int slot;
int found_type;
- bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), offset, 0);
if (ret < 0)
goto out;
- slot = path->slots[0];
if (ret == 1) {
- if (slot == 0) {
+ if (path->slots[0] == 0) {
/* can't find the item, must cow */
ret = 0;
goto out;
}
- slot--;
+ path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
@@ -6850,54 +7190,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
goto out;
}
- fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(leaf, fi);
- if (found_type != BTRFS_FILE_EXTENT_REG &&
- found_type != BTRFS_FILE_EXTENT_PREALLOC) {
- /* not a regular extent, must cow */
- goto out;
- }
-
- if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ if (btrfs_file_extent_end(path) <= offset)
goto out;
- extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end <= offset)
- goto out;
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (ram_bytes)
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (disk_bytenr == 0)
- goto out;
+ nocow_args.start = offset;
+ nocow_args.end = offset + *len - 1;
+ nocow_args.strict = strict;
+ nocow_args.free_path = true;
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out;
+ ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ /* can_nocow_file_extent() has freed the path. */
+ path = NULL;
- /*
- * Do the same check as in btrfs_cross_ref_exist but without the
- * unnecessary search.
- */
- if (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
+ if (ret != 1) {
+ /* Treat errors as not being able to NOCOW. */
+ ret = 0;
goto out;
-
- backref_offset = btrfs_file_extent_offset(leaf, fi);
-
- if (orig_start) {
- *orig_start = key.offset - backref_offset;
- *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
- if (btrfs_extent_readonly(fs_info, disk_bytenr))
+ ret = 0;
+ if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
goto out;
- num_bytes = min(offset + *len, extent_end) - offset;
- if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + num_bytes,
+ range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit(io_tree, offset, range_end,
EXTENT_DELALLOC, 0, NULL);
@@ -6907,35 +7231,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
}
- btrfs_release_path(path);
-
- /*
- * look for other files referencing this extent, if we
- * find any we must cow
- */
-
- ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr);
- if (ret) {
- ret = 0;
- goto out;
- }
+ if (orig_start)
+ *orig_start = key.offset - nocow_args.extent_offset;
+ if (orig_block_len)
+ *orig_block_len = nocow_args.disk_num_bytes;
- /*
- * adjust disk_bytenr and num_bytes to cover just the bytes
- * in this extent we are about to write. If there
- * are any csums in that range we have to cow in order
- * to keep the csums correct
- */
- disk_bytenr += backref_offset;
- disk_bytenr += offset - key.offset;
- if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
- goto out;
- /*
- * all of the above have passed, it is safe to overwrite this extent
- * without cow
- */
- *len = num_bytes;
+ *len = nocow_args.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
@@ -6943,14 +7244,22 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
int ret = 0;
while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ lock_extent(io_tree, lockstart, lockend, cached_state);
+ }
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure there's no ordered
@@ -6971,10 +7280,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
/*
* If we are doing a DIO read and the ordered extent we
* found is for a buffered write, we can not wait for it
@@ -6992,9 +7305,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
btrfs_put_ordered_extent(ordered);
} else {
/*
@@ -7002,15 +7315,15 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* for it to complete) and then invalidate the pages for
* this range (through invalidate_inode_pages2_range()),
* but that can lead us to a deadlock with a concurrent
- * call to readpages() (a buffered read or a defrag call
+ * call to readahead (a buffered read or a defrag call
* triggered a readahead) on a page lock due to an
* ordered dio extent we created before but did not have
* yet a corresponding bio submitted (whence it can not
- * complete), which makes readpages() wait for that
+ * complete), which makes readahead wait for that
* ordered extent to complete while holding a lock on
* that page.
*/
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
}
if (ret)
@@ -7023,13 +7336,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
}
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
- u64 orig_start, u64 block_start,
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
int ret;
@@ -7038,7 +7350,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &BTRFS_I(inode)->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7059,18 +7370,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
em->compress_type = compress_type;
}
- do {
- btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
- em->start + em->len - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- /*
- * The caller has taken lock_extent(), who could race with us
- * to add em?
- */
- } while (ret == -EEXIST);
-
+ ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
free_extent_map(em);
return ERR_PTR(ret);
@@ -7081,36 +7381,21 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
+ u64 start, u64 len,
+ unsigned int iomap_flags)
{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
+ int type;
+ u64 block_start, orig_start, orig_block_len, ram_bytes;
+ struct btrfs_block_group *bg;
+ bool can_nocow = false;
+ bool space_reserved = false;
+ u64 prev_len;
int ret = 0;
/*
@@ -7125,9 +7410,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
-
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
else
@@ -7136,108 +7418,207 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start)) {
- struct extent_map *em2;
-
- em2 = btrfs_create_dio_extent(inode, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(fs_info, block_start);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em = em2;
- }
-
- if (em2 && IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
- /*
- * For inode marked NODATACOW or extent marked PREALLOC,
- * use the existing or preallocated extent, so does not
- * need to adjust btrfs_space_info's bytes_may_use.
- */
- btrfs_free_reserved_data_space_noquota(inode, start,
- len);
- goto skip_cow;
+ &orig_block_len, &ram_bytes, false, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
}
}
- /* this will cow the extent */
- len = bh_result->b_size;
- free_extent_map(em);
- *map = em = btrfs_new_extent_direct(inode, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
+ prev_len = len;
+ if (can_nocow) {
+ struct extent_map *em2;
- len = min(len, em->len - (start - em->start));
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
+ goto out;
+ }
+ space_reserved = true;
-skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+ btrfs_dec_nocow_writers(bg);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em2;
+ em = em2;
+ }
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+
+ dio_data->nocow_done = true;
+ } else {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ if (nowait)
+ return -EAGAIN;
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved)
+ return -ENOSPC;
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
+ }
+
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
-
- WARN_ON(dio_data->reserve < len);
- dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ }
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
+ struct btrfs_dio_data *dio_data = iter->private;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ const u64 data_alloc_len = length;
+ bool unlock_extents = false;
- if (!create)
- len = min_t(u64, len, fs_info->sectorsize);
+ /*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocb's, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
+ if (!write)
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+ }
+
+ memset(dio_data, 0, sizeof(*dio_data));
+
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len, false);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
}
/*
* If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered.
+ * this range and we need to fallback to buffered IO, or we are doing a
+ * NOWAIT read/write and we need to block.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
- ret = -ENOTBLK;
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
goto err;
- }
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
@@ -7262,1128 +7643,737 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- ret = -ENOTBLK;
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data then what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len, flags);
if (ret < 0)
goto unlock_err;
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
- } else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
}
+ } else {
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
}
+ if (unlock_extents)
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
+ iomap->length = len;
+
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
+ iomap->flags |= IOMAP_F_ZONE_APPEND;
+
free_extent_map(em);
return 0;
unlock_err:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
- return ret;
-}
-
-static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
- struct bio *bio,
- int mirror_num)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
-
- BUG_ON(bio_op(bio) == REQ_OP_WRITE);
-
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
- if (ret)
- return ret;
-
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
return ret;
}
-static int btrfs_check_dio_repairable(struct inode *inode,
- struct bio *failed_bio,
- struct io_failure_record *failrec,
- int failed_mirror)
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int num_copies;
-
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
- if (num_copies == 1) {
- /*
- * we only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. no
- * matter what the error is, it is very likely to persist.
- */
- btrfs_debug(fs_info,
- "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return 0;
- }
-
- failrec->failed_mirror = failed_mirror;
- failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
- if (failrec->this_mirror > num_copies) {
- btrfs_debug(fs_info,
- "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
return 0;
}
- return 1;
-}
-
-static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- bio_end_io_t *repair_endio, void *repair_arg)
-{
- struct io_failure_record *failrec;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct bio *bio;
- int isector;
- unsigned int read_mode = 0;
- int segs;
- int ret;
- blk_status_t status;
- struct bio_vec bvec;
-
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
-
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
- if (ret)
- return errno_to_blk_status(ret);
-
- ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
- failed_mirror);
- if (!ret) {
- free_io_failure(failure_tree, io_tree, failrec);
- return BLK_STS_IOERR;
- }
-
- segs = bio_segments(failed_bio);
- bio_get_first_bvec(failed_bio, &bvec);
- if (segs > 1 ||
- (bvec.bv_len > btrfs_inode_sectorsize(inode)))
- read_mode |= REQ_FAILFAST_DEV;
-
- isector = start - btrfs_io_bio(failed_bio)->logical;
- isector >>= inode->i_sb->s_blocksize_bits;
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- pgoff, isector, repair_endio, repair_arg);
- bio->bi_opf = REQ_OP_READ | read_mode;
-
- btrfs_debug(BTRFS_I(inode)->root->fs_info,
- "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
- read_mode, failrec->this_mirror, failrec->in_validation);
-
- status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
- if (status) {
- free_io_failure(failure_tree, io_tree, failrec);
- bio_put(bio);
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ pos, length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
+ ret = -ENOTBLK;
}
- return status;
-}
-
-struct btrfs_retry_complete {
- struct completion done;
- struct inode *inode;
- u64 start;
- int uptodate;
-};
-
-static void btrfs_retry_endio_nocsum(struct bio *bio)
-{
- struct btrfs_retry_complete *done = bio->bi_private;
- struct inode *inode = done->inode;
- struct bio_vec *bvec;
- struct extent_io_tree *io_tree, *failure_tree;
- struct bvec_iter_all iter_all;
-
- if (bio->bi_status)
- goto end;
-
- ASSERT(bio->bi_vcnt == 1);
- io_tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
-
- done->uptodate = 1;
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
- io_tree, done->start, bvec->bv_page,
- btrfs_ino(BTRFS_I(inode)), 0);
-end:
- complete(&done->done);
- bio_put(bio);
+ if (write)
+ extent_changeset_free(dio_data->data_reserved);
+ return ret;
}
-static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
- struct btrfs_io_bio *io_bio)
+static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
{
- struct btrfs_fs_info *fs_info;
- struct bio_vec bvec;
- struct bvec_iter iter;
- struct btrfs_retry_complete done;
- u64 start;
- unsigned int pgoff;
- u32 sectorsize;
- int nr_sectors;
- blk_status_t ret;
- blk_status_t err = BLK_STS_OK;
-
- fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = fs_info->sectorsize;
-
- start = io_bio->logical;
- done.inode = inode;
- io_bio->bio.bi_iter = io_bio->iter;
-
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
- pgoff = bvec.bv_offset;
-
-next_block_or_try_again:
- done.uptodate = 0;
- done.start = start;
- init_completion(&done.done);
-
- ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
- pgoff, start, start + sectorsize - 1,
- io_bio->mirror_num,
- btrfs_retry_endio_nocsum, &done);
- if (ret) {
- err = ret;
- goto next;
- }
-
- wait_for_completion_io(&done.done);
-
- if (!done.uptodate) {
- /* We might have another mirror, so try again */
- goto next_block_or_try_again;
- }
-
-next:
- start += sectorsize;
+ /*
+ * This implies a barrier so that stores to dio_bio->bi_status before
+ * this and loads of dio_bio->bi_status after this are fully ordered.
+ */
+ if (!refcount_dec_and_test(&dip->refs))
+ return;
- nr_sectors--;
- if (nr_sectors) {
- pgoff += sectorsize;
- ASSERT(pgoff < PAGE_SIZE);
- goto next_block_or_try_again;
- }
+ if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ dip->file_offset, dip->bytes,
+ !dip->bio.bi_status);
+ } else {
+ unlock_extent(&BTRFS_I(dip->inode)->io_tree,
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
}
- return err;
+ kfree(dip->csums);
+ bio_endio(&dip->bio);
}
-static void btrfs_retry_endio(struct bio *bio)
+static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type)
{
- struct btrfs_retry_complete *done = bio->bi_private;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct extent_io_tree *io_tree, *failure_tree;
- struct inode *inode = done->inode;
- struct bio_vec *bvec;
- int uptodate;
- int ret;
- int i = 0;
- struct bvec_iter_all iter_all;
-
- if (bio->bi_status)
- goto end;
-
- uptodate = 1;
-
- ASSERT(bio->bi_vcnt == 1);
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
-
- io_tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
- bvec->bv_offset, done->start,
- bvec->bv_len);
- if (!ret)
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, io_tree, done->start,
- bvec->bv_page,
- btrfs_ino(BTRFS_I(inode)),
- bvec->bv_offset);
- else
- uptodate = 0;
- i++;
- }
+ BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- done->uptodate = uptodate;
-end:
- complete(&done->done);
- bio_put(bio);
+ refcount_inc(&dip->refs);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
-static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, blk_status_t err)
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
+ const bool uptodate)
{
- struct btrfs_fs_info *fs_info;
- struct bio_vec bvec;
+ struct inode *inode = dip->inode;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
+ blk_status_t err = BLK_STS_OK;
struct bvec_iter iter;
- struct btrfs_retry_complete done;
- u64 start;
- u64 offset = 0;
- u32 sectorsize;
- int nr_sectors;
- unsigned int pgoff;
- int csum_pos;
- bool uptodate = (err == 0);
- int ret;
- blk_status_t status;
-
- fs_info = BTRFS_I(inode)->root->fs_info;
- sectorsize = fs_info->sectorsize;
-
- err = BLK_STS_OK;
- start = io_bio->logical;
- done.inode = inode;
- io_bio->bio.bi_iter = io_bio->iter;
+ struct bio_vec bv;
+ u32 offset;
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
- pgoff = bvec.bv_offset;
-next_block:
- if (uptodate) {
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
- bvec.bv_page, pgoff, start, sectorsize);
- if (likely(!ret))
- goto next;
- }
-try_again:
- done.uptodate = 0;
- done.start = start;
- init_completion(&done.done);
-
- status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
- pgoff, start, start + sectorsize - 1,
- io_bio->mirror_num, btrfs_retry_endio,
- &done);
- if (status) {
- err = status;
- goto next;
- }
-
- wait_for_completion_io(&done.done);
-
- if (!done.uptodate) {
- /* We might have another mirror, so try again */
- goto try_again;
- }
-next:
- offset += sectorsize;
- start += sectorsize;
-
- ASSERT(nr_sectors);
+ if (uptodate &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
+ bv.bv_offset))) {
+ btrfs_clean_io_failure(BTRFS_I(inode), start,
+ bv.bv_page, bv.bv_offset);
+ } else {
+ int ret;
- nr_sectors--;
- if (nr_sectors) {
- pgoff += sectorsize;
- ASSERT(pgoff < PAGE_SIZE);
- goto next_block;
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
}
return err;
}
-static blk_status_t btrfs_subio_endio_read(struct inode *inode,
- struct btrfs_io_bio *io_bio, blk_status_t err)
-{
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
- if (skip_csum) {
- if (unlikely(err))
- return __btrfs_correct_data_nocsum(inode, io_bio);
- else
- return BLK_STS_OK;
- } else {
- return __btrfs_subio_endio_read(inode, io_bio, err);
- }
-}
-
-static void btrfs_endio_direct_read(struct bio *bio)
-{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
- struct bio *dio_bio;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- blk_status_t err = bio->bi_status;
-
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
- err = btrfs_subio_endio_read(inode, io_bio, err);
-
- unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
- dio_bio = dip->dio_bio;
-
- kfree(dip);
-
- dio_bio->bi_status = err;
- dio_end_io(dio_bio);
- btrfs_io_bio_free_csum(io_bio);
- bio_put(bio);
-}
-
-static void __endio_write_update_ordered(struct inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_extent *ordered = NULL;
- struct btrfs_workqueue *wq;
- u64 ordered_offset = offset;
- u64 ordered_bytes = bytes;
- u64 last_offset;
-
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- while (ordered_offset < offset + bytes) {
- last_offset = ordered_offset;
- if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
- NULL);
- btrfs_queue_work(wq, &ordered->work);
- }
- /*
- * If btrfs_dec_test_ordered_pending does not find any ordered
- * extent in the range, we can exit.
- */
- if (ordered_offset == last_offset)
- return;
- /*
- * Our bio might span multiple ordered extents. In this case
- * we keep going until we have accounted the whole dio.
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- }
- }
-}
-
-static void btrfs_endio_direct_write(struct bio *bio)
+static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct bio *dio_bio = dip->dio_bio;
-
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
- dip->bytes, !bio->bi_status);
-
- kfree(dip);
-
- dio_bio->bi_status = bio->bi_status;
- dio_end_io(dio_bio);
- bio_put(bio);
-}
-
-static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
- struct bio *bio, u64 offset)
-{
- struct inode *inode = private_data;
- blk_status_t ret;
- ret = btrfs_csum_one_bio(inode, bio, offset, 1);
- BUG_ON(ret); /* -ENOMEM */
- return 0;
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
-static void btrfs_end_dio_bio(struct bio *bio)
+static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
{
- struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_dio_private *dip = bbio->private;
+ struct bio *bio = &bbio->bio;
blk_status_t err = bio->bi_status;
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
- bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (dip->subio_endio)
- err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
- if (err) {
- /*
- * We want to perceive the errors flag being set before
- * decrementing the reference count. We don't need a barrier
- * since atomic operations with a return value are fully
- * ordered as per atomic_t.txt
- */
- dip->errors = 1;
- }
+ if (err)
+ dip->bio.bi_status = err;
- /* if there are more bios still pending for this dio, just exit */
- if (!atomic_dec_and_test(&dip->pending_bios))
- goto out;
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
- if (dip->errors) {
- bio_io_error(dip->orig_bio);
- } else {
- dip->dio_bio->bi_status = BLK_STS_OK;
- bio_endio(dip->orig_bio);
- }
-out:
bio_put(bio);
+ btrfs_dio_private_put(dip);
}
-static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
- struct btrfs_dio_private *dip,
- struct bio *bio,
- u64 file_offset)
-{
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
- u16 csum_size;
- blk_status_t ret;
-
- /*
- * We load all the csum data we need when we submit
- * the first bio to reduce the csum tree search and
- * contention.
- */
- if (dip->logical_offset == file_offset) {
- ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset,
- NULL);
- if (ret)
- return ret;
- }
-
- if (bio == dip->orig_bio)
- return 0;
-
- file_offset -= dip->logical_offset;
- file_offset >>= inode->i_sb->s_blocksize_bits;
- csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
- io_bio->csum = orig_io_bio->csum + csum_size * file_offset;
-
- return 0;
-}
-
-static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
- struct inode *inode, u64 file_offset, int async_submit)
+static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_private *dip = bio->bi_private;
- bool write = bio_op(bio) == REQ_OP_WRITE;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
blk_status_t ret;
- /* Check btrfs_submit_bio_hook() for rules about async submit. */
- if (async_submit)
- async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- if (!write) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto err;
- }
+ /* Save the original iter for read repair */
+ if (btrfs_op(bio) == BTRFS_MAP_READ)
+ btrfs_bio(bio)->iter = bio->bi_iter;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
- if (write && async_submit) {
- ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
- file_offset, inode,
- btrfs_submit_bio_start_direct_io);
- goto err;
- } else if (write) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ /* Check btrfs_submit_data_write_bio() for async submit rules */
+ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+ btrfs_submit_bio_start_direct_io))
+ return;
+
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
- if (ret)
- goto err;
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
} else {
- ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
- file_offset);
- if (ret)
- goto err;
+ btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
+ file_offset - dip->file_offset);
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0);
-err:
- return ret;
+ btrfs_submit_bio(fs_info, bio, 0);
}
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
+static void btrfs_submit_direct(const struct iomap_iter *iter,
+ struct bio *dio_bio, loff_t file_offset)
{
- struct inode *inode = dip->inode;
+ struct btrfs_dio_private *dip =
+ container_of(dio_bio, struct btrfs_dio_private, bio);
+ struct inode *inode = iter->inode;
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
+ BTRFS_BLOCK_GROUP_RAID56_MASK);
struct bio *bio;
- struct bio *orig_bio = dip->orig_bio;
- u64 start_sector = orig_bio->bi_iter.bi_sector;
- u64 file_offset = dip->logical_offset;
+ u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
+ u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iter->private;
+ struct extent_map *em = NULL;
- submit_len = orig_bio->bi_iter.bi_size;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
- start_sector << 9, submit_len, &geom);
- if (ret)
- return -EIO;
+ dip->inode = inode;
+ dip->file_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ refcount_set(&dip->refs, 1);
+ dip->csums = NULL;
- if (geom.len >= submit_len) {
- bio = orig_bio;
- dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
- goto submit;
+ if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ unsigned int nr_sectors =
+ (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
+
+ /*
+ * Load the csums up front to reduce csum tree searches and
+ * contention when submitting bios.
+ */
+ status = BLK_STS_RESOURCE;
+ dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
+ if (!dip->csums)
+ goto out_err;
+
+ status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
+ if (status != BLK_STS_OK)
+ goto out_err;
}
- /* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
- async_submit = 0;
- else
- async_submit = 1;
+ start_sector = dio_bio->bi_iter.bi_sector;
+ submit_len = dio_bio->bi_iter.bi_size;
- /* bio split */
- ASSERT(geom.len <= INT_MAX);
- atomic_inc(&dip->pending_bios);
do {
- clone_len = min_t(int, submit_len, geom.len);
+ logical = start_sector << 9;
+ em = btrfs_get_chunk_map(fs_info, logical, submit_len);
+ if (IS_ERR(em)) {
+ status = errno_to_blk_status(PTR_ERR(em));
+ em = NULL;
+ goto out_err_em;
+ }
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
+ logical, &geom);
+ if (ret) {
+ status = errno_to_blk_status(ret);
+ goto out_err_em;
+ }
+
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
* the allocation is backed by btrfs_bioset.
*/
- bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
- clone_len);
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
+ btrfs_end_dio_bio, dip);
+ btrfs_bio(bio)->file_offset = file_offset;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ status = extract_ordered_extent(BTRFS_I(inode), bio,
+ file_offset);
+ if (status) {
+ bio_put(bio);
+ goto out_err;
+ }
+ }
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
- if (submit_len == 0)
- break;
/*
* Increase the count before we submit the bio so we know
* the end IO handler won't happen before we increase the
* count. Otherwise, the dip might get freed before we're
* done setting it up.
+ *
+ * We transfer the initial reference to the last bio, so we
+ * don't need to increment the reference count for the last one.
*/
- atomic_inc(&dip->pending_bios);
-
- status = btrfs_submit_dio_bio(bio, inode, file_offset,
- async_submit);
- if (status) {
- bio_put(bio);
- atomic_dec(&dip->pending_bios);
- goto out_err;
+ if (submit_len > 0) {
+ refcount_inc(&dip->refs);
+ /*
+ * If we are submitting more than one bio, submit them
+ * all asynchronously. The exception is RAID 5 or 6, as
+ * asynchronous checksums make it difficult to collect
+ * full stripe writes.
+ */
+ if (!raid56)
+ async_submit = 1;
}
+ btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
- start_sector << 9, submit_len, &geom);
- if (ret)
- goto out_err;
+ free_extent_map(em);
} while (submit_len > 0);
+ return;
-submit:
- status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
- if (!status)
- return 0;
-
- bio_put(bio);
+out_err_em:
+ free_extent_map(em);
out_err:
- dip->errors = 1;
- /*
- * Before atomic variable goto zero, we must make sure dip->errors is
- * perceived to be set. This ordering is ensured by the fact that an
- * atomic operations with a return value are fully ordered as per
- * atomic_t.txt
- */
- if (atomic_dec_and_test(&dip->pending_bios))
- bio_io_error(dip->orig_bio);
-
- /* bio_end_io() will handle error, so we needn't return it */
- return 0;
+ dio_bio->bi_status = status;
+ btrfs_dio_private_put(dip);
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
-{
- struct btrfs_dio_private *dip = NULL;
- struct bio *bio = NULL;
- struct btrfs_io_bio *io_bio;
- bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
- int ret = 0;
-
- bio = btrfs_bio_clone(dio_bio);
-
- dip = kzalloc(sizeof(*dip), GFP_NOFS);
- if (!dip) {
- ret = -ENOMEM;
- goto free_ordered;
- }
-
- dip->private = dio_bio->bi_private;
- dip->inode = inode;
- dip->logical_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
- bio->bi_private = dip;
- dip->orig_bio = bio;
- dip->dio_bio = dio_bio;
- atomic_set(&dip->pending_bios, 0);
- io_bio = btrfs_io_bio(bio);
- io_bio->logical = file_offset;
-
- if (write) {
- bio->bi_end_io = btrfs_endio_direct_write;
- } else {
- bio->bi_end_io = btrfs_endio_direct_read;
- dip->subio_endio = btrfs_subio_endio_read;
- }
-
- /*
- * Reset the range for unsubmitted ordered extents (to a 0 length range)
- * even if we fail to submit a bio, because in such case we do the
- * corresponding error handling below and it must not be done a second
- * time by btrfs_direct_IO().
- */
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
-
- ret = btrfs_submit_direct_hook(dip);
- if (!ret)
- return;
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
- btrfs_io_bio_free_csum(io_bio);
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+ .bio_set = &btrfs_dio_bioset,
+};
-free_ordered:
- /*
- * If we arrived here it means either we failed to submit the dip
- * or we either failed to clone the dio_bio or failed to allocate the
- * dip. If we cloned the dio_bio and allocated the dip, we can just
- * call bio_endio against our io_bio so that we get proper resource
- * cleanup if we fail to submit the dip, otherwise, we must do the
- * same as btrfs_endio_direct_[write|read] because we can't call these
- * callbacks - they require an allocated dip and a clone of dio_bio.
- */
- if (bio && dip) {
- bio_io_error(bio);
- /*
- * The end io callbacks free our dip, do the final put on bio
- * and all the cleanup and final put for dio_bio (through
- * dio_end_io()).
- */
- dip = NULL;
- bio = NULL;
- } else {
- if (write)
- __endio_write_update_ordered(inode,
- file_offset,
- dio_bio->bi_iter.bi_size,
- false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
- file_offset + dio_bio->bi_iter.bi_size - 1);
+ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+{
+ struct btrfs_dio_data data;
- dio_bio->bi_status = BLK_STS_IOERR;
- /*
- * Releases and cleans up our dio_bio, no need to bio_put()
- * nor bio_endio()/bio_io_error() against dio_bio.
- */
- dio_end_io(dio_bio);
- }
- if (bio)
- bio_put(bio);
- kfree(dip);
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
{
- int seg;
- int i;
- unsigned int blocksize_mask = fs_info->sectorsize - 1;
- ssize_t retval = -EINVAL;
+ struct btrfs_dio_data data;
- if (offset & blocksize_mask)
- goto out;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
- return 0;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
- goto out;
- }
- }
- retval = 0;
-out:
- return retval;
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
- struct extent_changeset *data_reserved = NULL;
- loff_t offset = iocb->ki_pos;
- size_t count = 0;
- int flags = 0;
- bool wakeup = true;
- bool relock = false;
- ssize_t ret;
-
- if (check_direct_IO(fs_info, iter, offset))
- return 0;
+ int ret;
- inode_dio_begin(inode);
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (ret)
+ return ret;
/*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * in the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
*/
- count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
- if (iov_iter_rw(iter) == WRITE) {
- /*
- * If the write DIO is beyond the EOF, we need update
- * the isize, but it is protected by i_mutex. So we can
- * not unlock the i_mutex at this case.
- */
- if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
- inode_unlock(inode);
- relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- offset, count);
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
- down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
- }
-
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
- up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve, true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(inode,
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode, data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+ return ret;
}
-out:
- if (wakeup)
- inode_dio_end(inode);
- if (relock)
- inode_lock(inode);
- extent_changeset_free(data_reserved);
- return ret;
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
-#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
-
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+static int btrfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- int ret;
-
- ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
- if (ret)
- return ret;
-
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_writepages(mapping, wbc);
}
-int btrfs_readpage(struct file *file, struct page *page)
+static void btrfs_readahead(struct readahead_control *rac)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btrfs_get_extent, 0);
+ extent_readahead(rac);
}
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * For release_folio() and invalidate_folio() we have a race window where
+ * folio_end_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause use-after-free
+ * for subpage spinlock. So this function is to spin and wait for subpage
+ * spinlock.
+ */
+static void wait_subpage_spinlock(struct page *page)
{
- struct inode *inode = page->mapping->host;
- int ret;
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
/*
- * If we are under memory pressure we will call this directly from the
- * VM, we need to make sure we have the inode referenced for the ordered
- * extent. If not just return like we didn't do anything.
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is not dirty nor writeback, and we have page
+ * locked, the only possible way to hold a spinlock is from the endio
+ * function to clear page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
*/
- if (!igrab(inode)) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
- ret = extent_write_full_page(page, wbc);
- btrfs_add_delayed_iput(inode);
- return ret;
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
}
-static int btrfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- return extent_writepages(mapping, wbc);
+ int ret = try_release_extent_mapping(&folio->page, gfp_flags);
+
+ if (ret == 1) {
+ wait_subpage_spinlock(&folio->page);
+ clear_page_extent_mapped(&folio->page);
+ }
+ return ret;
}
-static int
-btrfs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- return extent_readpages(mapping, pages, nr_pages);
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
+ return __btrfs_release_folio(folio, gfp_flags);
}
-static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+#ifdef CONFIG_MIGRATION
+static int btrfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
{
- int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
+ int ret = filemap_migrate_folio(mapping, dst, src, mode);
+
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (folio_test_ordered(src)) {
+ folio_clear_ordered(src);
+ folio_set_ordered(dst);
}
- return ret;
-}
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
-{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
- return __btrfs_releasepage(page, gfp_flags);
+ return MIGRATEPAGE_SUCCESS;
}
+#else
+#define btrfs_migrate_folio NULL
+#endif
-static void btrfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
- u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_SIZE - 1;
- u64 start;
- u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = page_start + folio_size(folio) - 1;
+ u64 cur;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
- * we have the page locked, so new writeback can't start,
- * and the dirty bit won't be cleared while we are here.
+ * We have folio locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this folio.
+ *
+ * But already submitted bio can still be finished on this folio.
+ * Furthermore, endio function won't skip folio which has Ordered
+ * (Private2) already cleared, so it's possible for endio and
+ * invalidate_folio to do the same ordered extent accounting twice
+ * on one folio.
*
- * Wait for IO on this page so that we can safely clear
- * the PagePrivate2 bit and do ordered accounting
+ * So here we wait for any submitted bios to finish, so that we won't
+ * do double ordered extent accounting on the same folio.
*/
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
+ wait_subpage_spinlock(&folio->page);
- tree = &BTRFS_I(inode)->io_tree;
- if (offset) {
- btrfs_releasepage(page, GFP_NOFS);
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which passes range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full folio, we don't need to and
+ * shouldn't clear page extent mapped, as folio->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that invalidate the full folio even the range doesn't
+ * cover the full folio, like invalidating the last folio, we're
+ * still safe to wait for ordered extent to finish.
+ */
+ if (!(offset == 0 && length == folio_size(folio))) {
+ btrfs_release_folio(folio, GFP_NOFS);
return;
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, &cached_state);
-again:
- start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
- if (ordered) {
- end = min(page_end,
- ordered->file_offset + ordered->num_bytes - 1);
+ lock_extent(tree, page_start, page_end, &cached_state);
+
+ cur = page_start;
+ while (cur < page_end) {
+ struct btrfs_ordered_extent *ordered;
+ u64 range_end;
+ u32 range_len;
+ u32 extra_flags = 0;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ page_end + 1 - cur);
+ if (!ordered) {
+ range_end = page_end;
+ /*
+ * No ordered extent covering this range, we are safe
+ * to delete all extent states in the range.
+ */
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
+ goto next;
+ }
+ if (ordered->file_offset > cur) {
+ /*
+ * There is a range between [cur, oe->file_offset) not
+ * covered by any ordered extent.
+ * We are safe to delete all extent states, and handle
+ * the ordered extent in the next iteration.
+ */
+ range_end = ordered->file_offset - 1;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
+ goto next;
+ }
+
+ range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+ page_end);
+ ASSERT(range_end + 1 - cur < U32_MAX);
+ range_len = range_end + 1 - cur;
+ if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ /*
+ * If Ordered (Private2) is cleared, it means endio has
+ * already been executed for the range.
+ * We can't delete the extent states as
+ * btrfs_finish_ordered_io() may still use some of them.
+ */
+ goto next;
+ }
+ btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+
/*
- * IO on this page will never be started, so we need
- * to account for any ordered extents now
+ * IO on this page will never be started, so we need to account
+ * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
+ * here, must leave that up for the ordered extent completion.
+ *
+ * This will also unlock the range for incoming
+ * btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ clear_extent_bit(tree, cur, range_end,
+ EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state);
- /*
- * whoever cleared the private bit is responsible
- * for the finish_ordered_io
- */
- if (TestClearPagePrivate2(page)) {
- struct btrfs_ordered_inode_tree *tree;
- u64 new_len;
+ EXTENT_DEFRAG, &cached_state);
- tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&inode->ordered_tree.lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
- spin_lock_irq(&tree->lock);
- set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- new_len = start - ordered->file_offset;
- if (new_len < ordered->truncated_len)
- ordered->truncated_len = new_len;
- spin_unlock_irq(&tree->lock);
-
- if (btrfs_dec_test_ordered_pending(inode, &ordered,
- start,
- end - start + 1, 1))
- btrfs_finish_ordered_io(ordered);
+ /*
+ * If the ordered extent has finished, we're safe to delete all
+ * the extent states of the range, otherwise
+ * btrfs_finish_ordered_io() will get executed by endio for
+ * other pages, so we can't delete extent states.
+ */
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ cur, range_end + 1 - cur)) {
+ btrfs_finish_ordered_io(ordered);
+ /*
+ * The ordered extent has finished, now we're again
+ * safe to delete all extent states of the range.
+ */
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
}
- btrfs_put_ordered_extent(ordered);
+next:
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ /*
+ * Qgroup reserved space handler
+ * Sector(s) here will be either:
+ *
+ * 1) Already written to disk or bio already finished
+ * Then its QGROUP_RESERVED bit in io_tree is already cleared.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the
+ * QGROUP_RESERVED bit of its io_tree, and free the qgroup
+ * reserved data space.
+ * Since the IO will never happen for this page.
+ */
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
if (!inode_evicting) {
- cached_state = NULL;
- lock_extent_bits(tree, start, end,
- &cached_state);
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
+ extra_flags, &cached_state);
}
-
- start = end + 1;
- if (start < page_end)
- goto again;
+ cur = range_end + 1;
}
-
/*
- * Qgroup reserved space handler
- * Page here will be either
- * 1) Already written to disk
- * In this case, its reserved space is released from data rsv map
- * and will be freed by delayed_ref handler finally.
- * So even we call qgroup_free_data(), it won't decrease reserved
- * space.
- * 2) Not written to disk
- * This means the reserved space should be freed here. However,
- * if a truncate invalidates the page (by clearing PageDirty)
- * and the page is accounted for while allocating extent
- * in btrfs_check_data_free_space() we let delayed_ref to
- * free the entire extent.
- */
- if (PageDirty(page))
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
- if (!inode_evicting) {
- clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
- &cached_state);
-
- __btrfs_releasepage(page, GFP_NOFS);
- }
-
- ClearPageChecked(page);
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- }
+ * We have iterated through all ordered extents of the page, the page
+ * should not have Ordered (Private2) anymore, or the above iteration
+ * did something wrong.
+ */
+ ASSERT(!folio_test_ordered(folio));
+ btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ if (!inode_evicting)
+ __btrfs_release_folio(folio, GFP_NOFS);
+ clear_page_extent_mapped(&folio->page);
}
/*
@@ -8410,7 +8400,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- char *kaddr;
unsigned long zero_start;
loff_t size;
vm_fault_t ret;
@@ -8432,12 +8421,12 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* Reserving delalloc space after obtaining the page lock can lead to
* deadlock. For example, if a dirty page is locked by this function
* and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepage() function could
+ * dirty page write out, then the btrfs_writepages() function could
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- reserved_space);
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
+ page_start, reserved_space);
if (!ret2) {
ret2 = file_update_time(vmf->vma->vm_file);
reserved = 1;
@@ -8451,6 +8440,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
lock_page(page);
size = i_size_read(inode);
@@ -8461,8 +8451,13 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, &cached_state);
- set_page_extent_mapped(page);
+ lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
/*
* we can't set the delalloc bits if there are pending ordered
@@ -8471,10 +8466,10 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8484,9 +8479,9 @@ again:
fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE - reserved_space,
- true);
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved, page_start,
+ PAGE_SIZE - reserved_space, true);
}
}
@@ -8499,13 +8494,12 @@ again:
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
- ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -8516,21 +8510,17 @@ again:
else
zero_start = PAGE_SIZE;
- if (zero_start != PAGE_SIZE) {
- kaddr = kmap(page);
- memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
- flush_dcache_page(page);
- kunmap(page);
- }
- ClearPageChecked(page);
- set_page_dirty(page);
- SetPageUptodate(page);
+ if (zero_start != PAGE_SIZE)
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
@@ -8539,9 +8529,10 @@ again:
out_unlock:
unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
@@ -8551,6 +8542,12 @@ out_noreserve:
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
+ struct btrfs_truncate_control control = {
+ .inode = BTRFS_I(inode),
+ .ino = btrfs_ino(BTRFS_I(inode)),
+ .min_type = BTRFS_EXTENT_DATA_KEY,
+ .clear_extent_range = true,
+ };
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
@@ -8598,7 +8595,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
- rsv->failfast = 1;
+ rsv->failfast = true;
/*
* 1 for the truncate slack space
@@ -8615,25 +8612,38 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
min_size, false);
BUG_ON(ret);
- /*
- * So if we truncate and then write and fsync we normally would just
- * write the extents that changed, which is a problem if we need to
- * first truncate that entire inode. So set this flag so we write out
- * all of the extents in the inode to the sync log so we're completely
- * safe.
- */
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
trans->block_rsv = rsv;
while (1) {
- ret = btrfs_truncate_inode_items(trans, root, inode,
- inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
+ struct extent_state *cached_state = NULL;
+ const u64 new_size = inode->i_size;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+
+ control.new_size = new_size;
+ lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+ /*
+ * We want to drop from the next block forward in case this new
+ * size is not block aligned since we will be keeping the last
+ * block of the extent just the way it is.
+ */
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, false);
+
+ ret = btrfs_truncate_inode_items(trans, root, &control);
+
+ inode_sub_bytes(inode, control.sub_bytes);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
break;
@@ -8647,7 +8657,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1);
+ btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
@@ -8656,15 +8666,15 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/*
* We can't call btrfs_truncate_block inside a trans handle as we could
- * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
- * we've truncated everything except the last little bit, and can do
- * btrfs_truncate_block and then update the disk_i_size.
+ * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
+ * know we've truncated everything except the last little bit, and can
+ * do btrfs_truncate_block and then update the disk_i_size.
*/
- if (ret == NEED_TRUNCATE_BLOCK) {
+ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -8672,14 +8682,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
if (trans) {
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret2 && !ret)
ret = ret2;
@@ -8690,45 +8700,43 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
out:
btrfs_free_block_rsv(fs_info, rsv);
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ *
+ * If no extents were dropped or trimmed we don't need to force the next
+ * fsync to truncate all the inode's items from the log and re-log them
+ * all. This means the truncate operation did not change the file size,
+ * or changed it to a smaller size but there was only an implicit hole
+ * between the old i_size and the new i_size, and there were no prealloc
+ * extents beyond i_size to drop.
+ */
+ if (control.extents_found > 0)
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
return ret;
}
-/*
- * create a new subvolume directory/inode (helper for the ioctl).
- */
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid)
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
struct inode *inode;
- int err;
- u64 index = 0;
-
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
- new_dirid, new_dirid,
- S_IFDIR | (~current_umask() & S_IRWXUGO),
- &index);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- inode->i_op = &btrfs_dir_inode_operations;
- inode->i_fop = &btrfs_dir_file_operations;
-
- set_nlink(inode, 1);
- btrfs_i_size_write(BTRFS_I(inode), 0);
- unlock_new_inode(inode);
- err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
- if (err)
- btrfs_err(new_root->fs_info,
- "error inheriting subvolume %llu properties: %d",
- new_root->root_key.objectid, err);
-
- err = btrfs_update_inode(trans, new_root, inode);
-
- iput(inode);
- return err;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ /*
+ * Subvolumes don't inherit the sgid bit or the parent's gid if
+ * the parent's sgid bit is set. This is probably a bug.
+ */
+ inode_init_owner(mnt_userns, inode, NULL,
+ S_IFDIR | (~current_umask() & S_IRWXUGO));
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ }
+ return inode;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -8737,7 +8745,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -8751,13 +8759,16 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_reflink_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
+ spin_lock_init(&ei->io_failure_lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8774,17 +8785,16 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
- extent_io_tree_init(fs_info, &ei->io_failure_tree,
- IO_TREE_INODE_IO_FAILURE, inode);
- ei->io_tree.track_uptodate = true;
- ei->io_failure_tree.track_uptodate = true;
+ extent_io_tree_init(fs_info, &ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT, NULL);
+ ei->io_failure_tree = RB_ROOT;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
- init_rwsem(&ei->dio_sem);
+ init_rwsem(&ei->i_mmap_lock);
return inode;
}
@@ -8792,7 +8802,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
@@ -8802,21 +8812,24 @@ void btrfs_free_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
+ bool freespace_inode;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8826,14 +8839,24 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root)
return;
+ /*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
+
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
+
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -8841,7 +8864,9 @@ void btrfs_destroy_inode(struct inode *inode)
}
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -8860,7 +8885,7 @@ int btrfs_drop_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+ struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
}
@@ -8872,6 +8897,7 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
+ bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
kmem_cache_destroy(btrfs_path_cachep);
@@ -8908,23 +8934,31 @@ int __init btrfs_init_cachep(void)
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
- SLAB_RED_ZONE, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep)
goto fail;
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bio),
+ BIOSET_NEED_BVECS))
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
return -ENOMEM;
}
-static int btrfs_getattr(const struct path *path, struct kstat *stat,
+static int btrfs_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
u64 delalloc_bytes;
+ u64 inode_bytes;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -8937,19 +8971,22 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
+ inode_bytes = inode_get_bytes(inode);
spin_unlock(&BTRFS_I(inode)->lock);
- stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ stat->blocks = (ALIGN(inode_bytes, blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
@@ -8961,53 +8998,79 @@ static int btrfs_rename_exchange(struct inode *old_dir,
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
- bool root_log_pinned = false;
- bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
+ int ret2;
+ bool need_abort = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
- * should cover the worst case number of items we'll modify.
+ * For each inode:
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ * 1 to update parent inode
+ *
+ * If the parents are the same, we only need to account for one
*/
- trans = btrfs_start_transaction(root, 12);
+ trans_num_items = (old_dir == new_dir ? 9 : 10);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode item
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ trans_num_items += 4;
+ else
+ trans_num_items += 3;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
- if (dest != root)
- btrfs_record_root_in_trans(trans, dest);
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
/*
* We need to find a free sequence number both in the source and
@@ -9028,8 +9091,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9038,6 +9099,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9045,16 +9107,17 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
@@ -9062,8 +9125,10 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
- old_dir->i_ctime = old_dir->i_mtime = ctime;
- new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_dir->i_mtime = ctime;
+ old_dir->i_ctime = ctime;
+ new_dir->i_mtime = ctime;
+ new_dir->i_ctime = ctime;
old_inode->i_ctime = ctime;
new_inode->i_ctime = ctime;
@@ -9078,12 +9143,13 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &old_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, old_inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9094,12 +9160,13 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
- ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
- new_dentry->d_name.len);
+ new_dentry->d_name.len,
+ &new_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, dest, new_inode);
+ ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9127,175 +9194,77 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;
- if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(dest);
+
+ /* Do the log updates for all inodes. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
+
+ /* Now unpin the logs. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
out_fail:
- /*
- * If we have pinned a log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
- */
- if (ret && (root_log_pinned || dest_log_pinned)) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(trans);
-
- if (root_log_pinned) {
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
- }
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
-static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *dir,
- struct dentry *dentry)
+static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
- int ret;
struct inode *inode;
- u64 objectid;
- u64 index;
-
- ret = btrfs_find_free_ino(root, &objectid);
- if (ret)
- return ret;
-
- inode = btrfs_new_inode(trans, root, dir,
- dentry->d_name.name,
- dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)),
- objectid,
- S_IFCHR | WHITEOUT_MODE,
- &index);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- return ret;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ inode_init_owner(mnt_userns, inode, dir,
+ S_IFCHR | WHITEOUT_MODE);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
}
-
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode,
- WHITEOUT_DEV);
-
- ret = btrfs_init_inode_security(trans, inode, dir,
- &dentry->d_name);
- if (ret)
- goto out;
-
- ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (ret)
- goto out;
-
- ret = btrfs_update_inode(trans, root, inode);
-out:
- unlock_new_inode(inode);
- if (ret)
- inode_dec_link_count(inode);
- iput(inode);
-
- return ret;
+ return inode;
}
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_new_inode_args whiteout_args = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ };
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
- bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9339,31 +9308,67 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
- /* close the racy window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (flags & RENAME_WHITEOUT) {
+ whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
+ if (!whiteout_args.inode)
+ return -ENOMEM;
+ ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
+ if (ret)
+ goto out_whiteout_inode;
+ } else {
+ /* 1 to update the old parent inode. */
+ trans_num_items = 1;
+ }
+
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* Close the race window with snapshot create/destroy ioctl */
down_read(&fs_info->subvol_sem);
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume they are normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- * If our rename has the whiteout flag, we need more 5 units for the
- * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
- * when selinux is enabled).
- */
- trans_num_items = 11;
- if (flags & RENAME_WHITEOUT)
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ */
+ trans_num_items += 4;
+ /* 1 to update new parent inode if it's not the same as the old parent */
+ if (new_dir != old_dir)
+ trans_num_items++;
+ if (new_inode) {
+ /*
+ * 1 to update inode
+ * 1 to remove inode ref
+ * 1 to remove dir item
+ * 1 to remove dir index
+ * 1 to possibly add orphan item
+ */
trans_num_items += 5;
+ }
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
- if (dest != root)
- btrfs_record_root_in_trans(trans, dest);
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
if (ret)
@@ -9374,8 +9379,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9388,9 +9391,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- old_inode->i_ctime = current_time(old_dir);
+ old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_ctime = old_dir->i_mtime;
+ new_dir->i_mtime = old_dir->i_mtime;
+ new_dir->i_ctime = old_dir->i_mtime;
+ old_inode->i_ctime = old_dir->i_mtime;
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9399,12 +9404,13 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, old_inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9419,7 +9425,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
- ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -9444,90 +9450,54 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
-
+ ret = btrfs_create_new_inode(trans, &whiteout_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
+ } else {
+ unlock_new_inode(whiteout_args.inode);
+ iput(whiteout_args.inode);
+ whiteout_args.inode = NULL;
}
}
out_fail:
- /*
- * If we have pinned the log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
- */
- if (ret && log_pinned) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(trans);
-
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
-
+ if (flags & RENAME_WHITEOUT)
+ btrfs_new_inode_args_destroy(&whiteout_args);
+out_whiteout_inode:
+ if (flags & RENAME_WHITEOUT)
+ iput(whiteout_args.inode);
return ret;
}
-static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
+ int ret;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
- new_dentry);
+ ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+ else
+ ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
+
+ btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return ret;
}
struct btrfs_delalloc_work {
@@ -9574,7 +9544,9 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root,
+ struct writeback_control *wbc, bool snapshot,
+ bool in_reclaim_context)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9582,6 +9554,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
struct list_head works;
struct list_head splice;
int ret = 0;
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
@@ -9595,6 +9568,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
list_move_tail(&binode->delalloc_inodes,
&root->delalloc_inodes);
+
+ if (in_reclaim_context &&
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ continue;
+
inode = igrab(&binode->vfs_inode);
if (!inode) {
cond_resched_lock(&root->delalloc_lock);
@@ -9605,18 +9583,22 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&binode->runtime_flags);
- work = btrfs_alloc_delalloc_work(inode);
- if (!work) {
- iput(inode);
- ret = -ENOMEM;
- goto out;
+ if (full_flush) {
+ work = btrfs_alloc_delalloc_work(inode);
+ if (!work) {
+ iput(inode);
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ } else {
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
+ btrfs_add_delayed_iput(inode);
+ if (ret || wbc->nr_to_write <= 0)
+ goto out;
}
- list_add_tail(&work->list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -9638,27 +9620,36 @@ out:
return ret;
}
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
+ bool in_reclaim_context)
{
+ struct writeback_control wbc = {
+ .nr_to_write = nr,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_root *root;
struct list_head splice;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
INIT_LIST_HEAD(&splice);
@@ -9666,24 +9657,26 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice) && nr) {
+ while (!list_empty(&splice)) {
+ /*
+ * Reset nr_to_write here so we know that we're doing a full
+ * flush.
+ */
+ if (nr == LONG_MAX)
+ wbc.nr_to_write = LONG_MAX;
+
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->delalloc_root,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
- btrfs_put_fs_root(root);
- if (ret < 0)
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ btrfs_put_root(root);
+ if (ret < 0 || wbc.nr_to_write <= 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -9699,18 +9692,21 @@ out:
return ret;
}
-static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
+static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ };
+ unsigned int trans_num_items;
int err;
- u64 objectid;
- u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
@@ -9721,49 +9717,40 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return -ENAMETOOLONG;
- /*
- * 2 items for inode item and ref
- * 2 items for dir items
- * 1 item for updating parent inode item
- * 1 item for the inline extent item
- * 1 item for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 7);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &btrfs_aops;
+ btrfs_i_size_write(BTRFS_I(inode), name_len);
+ inode_set_bytes(inode, name_len);
- err = btrfs_find_free_ino(root, &objectid);
+ new_inode_args.inode = inode;
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
+ /* 1 additional item for the inline extent */
+ trans_num_items++;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
- inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
if (err)
- goto out_unlock;
+ goto out;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- goto out_unlock;
+ btrfs_abort_transaction(trans, err);
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
key.offset = 0;
@@ -9772,8 +9759,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
+ btrfs_abort_transaction(trans, err);
btrfs_free_path(path);
- goto out_unlock;
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -9791,41 +9781,102 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode_set_bytes(inode, name_len);
- btrfs_i_size_write(BTRFS_I(inode), name_len);
- err = btrfs_update_inode(trans, root, inode);
- /*
- * Last step, add directory indexes for our symlink inode. This is the
- * last step to avoid extra cleanup of these indexes if an error happens
- * elsewhere above.
- */
- if (!err)
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (err)
- goto out_unlock;
-
d_instantiate_new(dentry, inode);
-
-out_unlock:
+ err = 0;
+out:
btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
+ struct btrfs_inode *inode,
+ struct btrfs_key *ins,
+ u64 file_offset)
+{
+ struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
+ u64 start = ins->objectid;
+ u64 len = ins->offset;
+ int qgroup_released;
+ int ret;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
+ /* Encryption and other encoding is reserved and all 0 */
+
+ qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
+ if (qgroup_released < 0)
+ return ERR_PTR(qgroup_released);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, inode,
+ file_offset, &stack_fi,
+ true, qgroup_released);
+ if (ret)
+ goto free_qgroup;
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.update_times = true;
+ extent_info.qgroup_reserved = qgroup_released;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto free_qgroup;
+ }
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ goto free_qgroup;
+ return trans;
+
+free_qgroup:
+ /*
+ * We have released qgroup data range at the beginning of the function,
+ * and normally qgroup_released bytes will be freed when committing
+ * transaction.
+ * But if we error out early, we have to free what we have released
+ * or we leak qgroup data reservation.
+ */
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid, qgroup_released,
+ BTRFS_QGROUP_RSV_DATA);
+ return ERR_PTR(ret);
+}
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -9841,14 +9892,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -9860,11 +9903,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -9874,30 +9914,29 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
* clear_offset by our extent size.
*/
clear_offset += ins.offset;
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_reserved_file_extent(trans, inode,
- cur_offset, ins.objectid,
- ins.offset, ins.offset,
- ins.offset, 0, 0, 0,
- BTRFS_FILE_EXTENT_PREALLOC);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
+ &ins, cur_offset);
+ /*
+ * Now that we inserted the prealloc extent we can finally
+ * decrement the number of reservations in the block group.
+ * If we did it before, we could race with relocation and have
+ * relocation miss the reserved extent, making it fail later.
+ */
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset -1, 0);
-
em = alloc_extent_map();
if (!em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
+ cur_offset + ins.offset - 1, false);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -9911,16 +9950,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset - 1,
- 0);
- }
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
free_extent_map(em);
next:
num_bytes -= ins.offset;
@@ -9938,10 +9968,10 @@ next:
else
i_size = cur_offset;
i_size_write(inode, i_size);
- btrfs_ordered_update_i_size(inode, i_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9950,11 +9980,13 @@ next:
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
end - clear_offset + 1);
return ret;
}
@@ -9977,12 +10009,8 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
-static int btrfs_set_page_dirty(struct page *page)
-{
- return __set_page_dirty_nobuffers(page);
-}
-
-static int btrfs_permission(struct inode *inode, int mask)
+static int btrfs_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
umode_t mode = inode->i_mode;
@@ -9994,90 +10022,813 @@ static int btrfs_permission(struct inode *inode, int mask)
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
-static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- u64 objectid;
- u64 index;
- int ret = 0;
-
- /*
- * 5 units required for adding orphan entry
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_find_free_ino(root, &objectid);
- if (ret)
- goto out;
-
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- inode = NULL;
- goto out;
- }
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = file->f_path.dentry,
+ .orphan = true,
+ };
+ unsigned int trans_num_items;
+ int ret;
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
-
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
- ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ new_inode_args.inode = inode;
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (ret)
- goto out;
+ goto out_inode;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- goto out;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret)
- goto out;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
/*
- * We set number of links to 0 in btrfs_new_inode(), and here we set
- * it to 1 because d_tmpfile() will issue a warning if the count is 0,
- * through:
+ * We set number of links to 0 in btrfs_create_new_inode(), and here we
+ * set it to 1 because d_tmpfile() will issue a warning if the count is
+ * 0, through:
*
* d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
- unlock_new_inode(inode);
- mark_inode_dirty(inode);
-out:
+
+ if (!ret) {
+ d_tmpfile(file, inode);
+ unlock_new_inode(inode);
+ mark_inode_dirty(inode);
+ }
+
btrfs_end_transaction(trans);
- if (ret && inode)
- discard_new_inode(inode);
btrfs_btree_balance_dirty(fs_info);
- return ret;
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (ret)
+ iput(inode);
+ return finish_open_simple(file, ret);
}
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct inode *inode = tree->private_data;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
+ u32 len;
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
+
+ btrfs_page_set_writeback(fs_info, page, start, len);
put_page(page);
index++;
}
}
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type)
+{
+ switch (compress_type) {
+ case BTRFS_COMPRESS_NONE:
+ return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+ case BTRFS_COMPRESS_ZLIB:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+ case BTRFS_COMPRESS_LZO:
+ /*
+ * The LZO format depends on the sector size. 64K is the maximum
+ * sector size that we support.
+ */
+ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+ return -EINVAL;
+ return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+ (fs_info->sectorsize_bits - 12);
+ case BTRFS_COMPRESS_ZSTD:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+ default:
+ return -EUCLEAN;
+ }
+}
+
+static ssize_t btrfs_encoded_read_inline(
+ struct kiocb *iocb,
+ struct iov_iter *iter, u64 start,
+ u64 lockend,
+ struct extent_state **cached_state,
+ u64 extent_start, size_t count,
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *item;
+ u64 ram_bytes;
+ unsigned long ptr;
+ void *tmp;
+ ssize_t ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ extent_start, 0);
+ if (ret) {
+ if (ret > 0) {
+ /* The extent item disappeared? */
+ ret = -EIO;
+ }
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ encoded->len = min_t(u64, extent_start + ram_bytes,
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, item));
+ if (ret < 0)
+ goto out;
+ encoded->compression = ret;
+ if (encoded->compression) {
+ size_t inline_size;
+
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]);
+ if (inline_size > count) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+ count = inline_size;
+ encoded->unencoded_len = ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - extent_start;
+ } else {
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ ptr += iocb->ki_pos - extent_start;
+ }
+
+ tmp = kmalloc(count, GFP_NOFS);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(leaf, tmp, ptr, count);
+ btrfs_release_path(path);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ ret = copy_to_iter(tmp, count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ kfree(tmp);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_encoded_read_private {
+ struct btrfs_inode *inode;
+ u64 file_offset;
+ wait_queue_head_t wait;
+ atomic_t pending;
+ blk_status_t status;
+ bool skip_csum;
+};
+
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+ struct bio *bio, int mirror_num)
+{
+ struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ blk_status_t ret;
+
+ if (!priv->skip_csum) {
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+ if (ret)
+ return ret;
+ }
+
+ atomic_inc(&priv->pending);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return BLK_STS_OK;
+}
+
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+ const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+ struct btrfs_encoded_read_private *priv = bbio->private;
+ struct btrfs_inode *inode = priv->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ u32 bio_offset = 0;
+
+ if (priv->skip_csum || !uptodate)
+ return bbio->bio.bi_status;
+
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ unsigned int i, nr_sectors, pgoff;
+
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+ for (i = 0; i < nr_sectors; i++) {
+ ASSERT(pgoff < PAGE_SIZE);
+ if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff))
+ return BLK_STS_IOERR;
+ bio_offset += sectorsize;
+ pgoff += sectorsize;
+ }
+ }
+ return BLK_STS_OK;
+}
+
+static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
+{
+ struct btrfs_encoded_read_private *priv = bbio->private;
+ blk_status_t status;
+
+ status = btrfs_encoded_read_verify_csum(bbio);
+ if (status) {
+ /*
+ * The memory barrier implied by the atomic_dec_return() here
+ * pairs with the memory barrier implied by the
+ * atomic_dec_return() or io_wait_event() in
+ * btrfs_encoded_read_regular_fill_pages() to ensure that this
+ * write is observed before the load of status in
+ * btrfs_encoded_read_regular_fill_pages().
+ */
+ WRITE_ONCE(priv->status, status);
+ }
+ if (!atomic_dec_return(&priv->pending))
+ wake_up(&priv->wait);
+ btrfs_bio_free_csum(bbio);
+ bio_put(&bbio->bio);
+}
+
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size, struct page **pages)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_encoded_read_private priv = {
+ .inode = inode,
+ .file_offset = file_offset,
+ .pending = ATOMIC_INIT(1),
+ .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+ };
+ unsigned long i = 0;
+ u64 cur = 0;
+ int ret;
+
+ init_waitqueue_head(&priv.wait);
+ /*
+ * Submit bios for the extent, splitting due to bio or stripe limits as
+ * necessary.
+ */
+ while (cur < disk_io_size) {
+ struct extent_map *em;
+ struct btrfs_io_geometry geom;
+ struct bio *bio = NULL;
+ u64 remaining;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+ disk_io_size - cur);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ } else {
+ ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+ disk_bytenr + cur, &geom);
+ free_extent_map(em);
+ }
+ if (ret) {
+ WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+ break;
+ }
+ remaining = min(geom.len, disk_io_size - cur);
+ while (bio || remaining) {
+ size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+ if (!bio) {
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+ btrfs_encoded_read_endio,
+ &priv);
+ bio->bi_iter.bi_sector =
+ (disk_bytenr + cur) >> SECTOR_SHIFT;
+ }
+
+ if (!bytes ||
+ bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+ blk_status_t status;
+
+ status = submit_encoded_read_bio(inode, bio, 0);
+ if (status) {
+ WRITE_ONCE(priv.status, status);
+ bio_put(bio);
+ goto out;
+ }
+ bio = NULL;
+ continue;
+ }
+
+ i++;
+ cur += bytes;
+ remaining -= bytes;
+ }
+ }
+
+out:
+ if (atomic_dec_return(&priv.pending))
+ io_wait_event(priv.wait, !atomic_read(&priv.pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+ struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ unsigned long nr_pages, i;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ disk_io_size, pages);
+ if (ret)
+ goto out;
+
+ unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ if (compressed) {
+ i = 0;
+ page_offset = 0;
+ } else {
+ i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+ page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+ }
+ cur = 0;
+ while (cur < count) {
+ size_t bytes = min_t(size_t, count - cur,
+ PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(pages[i], page_offset, bytes,
+ iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+ i++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = count;
+out:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ return ret;
+}
+
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ ssize_t ret;
+ size_t count = iov_iter_count(iter);
+ u64 start, lockend, disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ bool unlocked = false;
+
+ file_accessed(iocb->ki_filp);
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+ if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return 0;
+ }
+ start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+ /*
+ * We don't know how long the extent containing iocb->ki_pos is, but if
+ * it's compressed we know that it won't be longer than this.
+ */
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent(io_tree, start, lockend, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ cond_resched();
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_extent;
+ }
+
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ u64 extent_start = em->start;
+
+ /*
+ * For inline extents we get everything we need out of the
+ * extent item.
+ */
+ free_extent_map(em);
+ em = NULL;
+ ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+ &cached_state, extent_start,
+ count, encoded, &unlocked);
+ goto out;
+ }
+
+ /*
+ * We only want to return up to EOF even if the extent extends beyond
+ * that.
+ */
+ encoded->len = min_t(u64, extent_map_end(em),
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ disk_bytenr = EXTENT_MAP_HOLE;
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ disk_bytenr = em->block_start;
+ /*
+ * Bail if the buffer isn't large enough to return the whole
+ * compressed extent.
+ */
+ if (em->block_len > count) {
+ ret = -ENOBUFS;
+ goto out_em;
+ }
+ disk_io_size = em->block_len;
+ count = em->block_len;
+ encoded->unencoded_len = em->ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ em->compress_type);
+ if (ret < 0)
+ goto out_em;
+ encoded->compression = ret;
+ } else {
+ disk_bytenr = em->block_start + (start - em->start);
+ if (encoded->len > count)
+ encoded->len = count;
+ /*
+ * Don't read beyond what we locked. This also limits the page
+ * allocations that we'll do.
+ */
+ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + disk_io_size - iocb->ki_pos;
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ }
+ free_extent_map(em);
+ em = NULL;
+
+ if (disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlocked = true;
+ ret = iov_iter_zero(count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ } else {
+ ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ encoded->compression,
+ &unlocked);
+ }
+
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+out_em:
+ free_extent_map(em);
+out_unlock_extent:
+ if (!unlocked)
+ unlock_extent(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+ if (!unlocked)
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_changeset *data_reserved = NULL;
+ struct extent_state *cached_state = NULL;
+ int compression;
+ size_t orig_count;
+ u64 start, end;
+ u64 num_bytes, ram_bytes, disk_num_bytes;
+ unsigned long nr_pages, i;
+ struct page **pages;
+ struct btrfs_key ins;
+ bool extent_reserved = false;
+ struct extent_map *em;
+ ssize_t ret;
+
+ switch (encoded->compression) {
+ case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
+ compression = BTRFS_COMPRESS_ZLIB;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
+ compression = BTRFS_COMPRESS_ZSTD;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
+ /* The sector size must match for LZO. */
+ if (encoded->compression -
+ BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
+ fs_info->sectorsize_bits)
+ return -EINVAL;
+ compression = BTRFS_COMPRESS_LZO;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
+ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+ orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
+ return -EINVAL;
+
+ /*
+ * The compressed data must be smaller than the decompressed data.
+ *
+ * It's of course possible for data to compress to larger or the same
+ * size, but the buffered I/O path falls back to no compression for such
+ * data, and we don't want to break any assumptions by creating these
+ * extents.
+ *
+ * Note that this is less strict than the current check we have that the
+ * compressed data must be at least one sector smaller than the
+ * decompressed data. We only want to enforce the weaker requirement
+ * from old kernels that it is at least one byte smaller.
+ */
+ if (orig_count >= encoded->unencoded_len)
+ return -EINVAL;
+
+ /* The extent must start on a sector boundary. */
+ start = iocb->ki_pos;
+ if (!IS_ALIGNED(start, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * The extent must end on a sector boundary. However, we allow a write
+ * which ends at or extends i_size to have an unaligned length; we round
+ * up the extent size and set i_size to the unaligned end.
+ */
+ if (start + encoded->len < inode->vfs_inode.i_size &&
+ !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
+ return -EINVAL;
+
+ /* Finally, the offset in the unencoded data must be sector-aligned. */
+ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
+ return -EINVAL;
+
+ num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
+ ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+ end = start + num_bytes - 1;
+
+ /*
+ * If the extent cannot be inline, the compressed data on disk must be
+ * sector-aligned. For convenience, we extend it with zeroes if it
+ * isn't.
+ */
+ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
+ nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+ char *kaddr;
+
+ pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ kaddr = kmap_local_page(pages[i]);
+ if (copy_from_iter(kaddr, bytes, from) != bytes) {
+ kunmap_local(kaddr);
+ ret = -EFAULT;
+ goto out_pages;
+ }
+ if (bytes < PAGE_SIZE)
+ memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+ kunmap_local(kaddr);
+ }
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ if (ret)
+ goto out_pages;
+ ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ if (ret)
+ goto out_pages;
+ lock_extent(io_tree, start, end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
+ if (!ordered &&
+ !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
+ break;
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, end, &cached_state);
+ cond_resched();
+ }
+
+ /*
+ * We don't use the higher-level delalloc space functions because our
+ * num_bytes and disk_num_bytes are different.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
+ if (ret)
+ goto out_unlock;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
+ if (ret)
+ goto out_free_data_space;
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+ false);
+ if (ret)
+ goto out_qgroup_free_data;
+
+ /* Try an inline extent first. */
+ if (start == 0 && encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0) {
+ ret = cow_file_range_inline(inode, encoded->len, orig_count,
+ compression, pages, true);
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = orig_count;
+ goto out_delalloc_release;
+ }
+ }
+
+ ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
+ disk_num_bytes, 0, 0, &ins, 1, 1);
+ if (ret)
+ goto out_delalloc_release;
+ extent_reserved = true;
+
+ em = create_io_em(inode, start, num_bytes,
+ start - encoded->unencoded_offset, ins.objectid,
+ ins.offset, ins.offset, ram_bytes, compression,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserved;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ins.objectid, ins.offset,
+ encoded->unencoded_offset,
+ (1 << BTRFS_ORDERED_ENCODED) |
+ (1 << BTRFS_ORDERED_COMPRESSED),
+ compression);
+ if (ret) {
+ btrfs_drop_extent_map_range(inode, start, end, false);
+ goto out_free_reserved;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ if (start + encoded->len > inode->vfs_inode.i_size)
+ i_size_write(&inode->vfs_inode, start + encoded->len);
+
+ unlock_extent(io_tree, start, end, &cached_state);
+
+ btrfs_delalloc_release_extents(inode, num_bytes);
+
+ if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, NULL,
+ false)) {
+ btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
+ ret = -EIO;
+ goto out_pages;
+ }
+ ret = orig_count;
+ goto out;
+
+out_free_reserved:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+ btrfs_delalloc_release_extents(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+out_qgroup_free_data:
+ if (ret < 0)
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+ * bytes_may_use.
+ */
+ if (!extent_reserved)
+ btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+out_unlock:
+ unlock_extent(io_tree, start, end, &cached_state);
+out_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kvfree(pages);
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+ return ret;
+}
+
#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a
@@ -10098,6 +10849,7 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
sp->ptr = ptr;
sp->inode = inode;
sp->is_block_group = is_block_group;
+ sp->bg_extent_count = 1;
spin_lock(&fs_info->swapfile_pins_lock);
p = &fs_info->swapfile_pins.rb_node;
@@ -10111,6 +10863,8 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
(sp->ptr == entry->ptr && sp->inode > entry->inode)) {
p = &(*p)->rb_right;
} else {
+ if (is_block_group)
+ entry->bg_extent_count++;
spin_unlock(&fs_info->swapfile_pins_lock);
kfree(sp);
return 1;
@@ -10136,8 +10890,11 @@ static void btrfs_free_swapfile_pins(struct inode *inode)
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (sp->inode == inode) {
rb_erase(&sp->node, &fs_info->swapfile_pins);
- if (sp->is_block_group)
+ if (sp->is_block_group) {
+ btrfs_dec_block_group_swap_extents(sp->ptr,
+ sp->bg_extent_count);
btrfs_put_block_group(sp->ptr);
+ }
kfree(sp);
}
node = next;
@@ -10159,9 +10916,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10169,6 +10936,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
@@ -10198,7 +10966,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
@@ -10237,29 +11006,58 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
}
+
+ /*
+ * Prevent snapshot creation while we are activating the swap file.
+ * We do not want to race with snapshot creation. If snapshot creation
+ * already started before we bumped nr_swapfiles from 0 to 1 and
+ * completes before the first write into the swap file after it is
+ * activated, than that write would fallback to COW.
+ */
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because snapshot creation is in progress");
+ return -EINVAL;
+ }
/*
* Snapshots can create extents which require COW even if NODATACOW is
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
+ *
+ * It is possible that subvolume is marked for deletion but still not
+ * removed yet. To prevent this race, we check the root status before
+ * activating the swapfile.
*/
- atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
+ atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+ lock_extent(io_tree, 0, isize - 1, &cached_state);
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
@@ -10300,7 +11098,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -10352,6 +11150,17 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
+ if (!btrfs_inc_block_group_swap_extents(bg)) {
+ btrfs_warn(fs_info,
+ "block group for swapfile at %llu is read-only%s",
+ bg->start,
+ atomic_read(&fs_info->scrubs_running) ?
+ " (scrub running)" : "");
+ btrfs_put_block_group(bg);
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = btrfs_add_swapfile_pin(inode, bg, true);
if (ret) {
btrfs_put_block_group(bg);
@@ -10385,12 +11194,14 @@ out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
- unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+ unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -10415,6 +11226,62 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
#endif
+/*
+ * Update the number of bytes used in the VFS' inode. When we replace extents in
+ * a range (clone, dedupe, fallocate's zero range), we must update the number of
+ * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
+ * always get a correct value.
+ */
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes)
+{
+ if (add_bytes == del_bytes)
+ return;
+
+ spin_lock(&inode->lock);
+ if (del_bytes > 0)
+ inode_sub_bytes(&inode->vfs_inode, del_bytes);
+ if (add_bytes > 0)
+ inode_add_bytes(&inode->vfs_inode, add_bytes);
+ spin_unlock(&inode->lock);
+}
+
+/**
+ * Verify that there are no ordered extents for a given file range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the file range, should be sector size aligned.
+ * @end: End offset (inclusive) of the file range, its value +1 should be
+ * sector size aligned.
+ *
+ * This should typically be used for cases where we locked an inode's VFS lock in
+ * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
+ * we have flushed all delalloc in the range, we have waited for all ordered
+ * extents in the range to complete and finally we have locked the file range in
+ * the inode's io_tree.
+ */
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_ordered_extent *ordered;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
+ if (ordered) {
+ btrfs_err(root->fs_info,
+"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
+ start, end, btrfs_ino(inode), root->root_key.objectid,
+ ordered->file_offset,
+ ordered->file_offset + ordered->num_bytes - 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ASSERT(ordered == NULL);
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -10433,6 +11300,8 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
};
static const struct file_operations btrfs_dir_file_operations = {
@@ -10448,12 +11317,6 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -10467,14 +11330,14 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
* For now we're avoiding this by dropping bmap.
*/
static const struct address_space_operations btrfs_aops = {
- .readpage = btrfs_readpage,
- .writepage = btrfs_writepage,
+ .read_folio = btrfs_read_folio,
.writepages = btrfs_writepages,
- .readpages = btrfs_readpages,
- .direct_IO = btrfs_direct_IO,
- .invalidatepage = btrfs_invalidatepage,
- .releasepage = btrfs_releasepage,
- .set_page_dirty = btrfs_set_page_dirty,
+ .readahead = btrfs_readahead,
+ .direct_IO = noop_direct_IO,
+ .invalidate_folio = btrfs_invalidate_folio,
+ .release_folio = btrfs_release_folio,
+ .migrate_folio = btrfs_migrate_folio,
+ .dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
@@ -10489,6 +11352,8 @@ static const struct inode_operations btrfs_file_inode_operations = {
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4f4b13830b25..d5dd8bed1488 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -26,14 +26,17 @@
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
+#include <linux/fileattr.h>
+#include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
#include "ctree.h"
#include "disk-io.h"
+#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
-#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
@@ -46,6 +49,7 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -79,16 +83,31 @@ struct btrfs_ioctl_send_args_32 {
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32)
-#endif
-static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff,
- int no_time_update);
+struct btrfs_ioctl_encoded_io_args_32 {
+ compat_uptr_t iov;
+ compat_ulong_t iovcnt;
+ __s64 offset;
+ __u64 flags;
+ __u64 len;
+ __u64 unencoded_len;
+ __u64 unencoded_offset;
+ __u32 compression;
+ __u32 encryption;
+ __u8 reserved[64];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#endif
/* Mask out flags that are inappropriate for the given type of inode. */
static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
@@ -106,9 +125,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
{
unsigned int iflags = 0;
+ u32 flags = binode->flags;
+ u32 ro_flags = binode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -124,6 +145,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
iflags |= FS_DIRSYNC_FL;
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
+ if (ro_flags & BTRFS_INODE_RO_VERITY)
+ iflags |= FS_VERITY_FL;
if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
@@ -151,24 +174,19 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (binode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
+ if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ new_fl |= S_VERITY;
set_mask_bits(&inode->i_flags,
- S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
- new_fl);
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+ S_VERITY, new_fl);
}
-static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
-{
- struct btrfs_inode *binode = BTRFS_I(file_inode(file));
- unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
-
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -EFAULT;
- return 0;
-}
-
-/* Check if @flags are a supported and valid set of FS_*_FL flags */
-static int check_fsflags(unsigned int flags)
+/*
+ * Check if @flags are a supported and valid set of FS_*_FL flags and that
+ * the old and new flags are not conflicting
+ */
+static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -177,15 +195,47 @@ static int check_fsflags(unsigned int flags)
FS_NOCOW_FL))
return -EOPNOTSUPP;
+ /* COMPR and NOCOMP on new/old are valid */
if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
return -EINVAL;
+ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
+ return -EINVAL;
+
+ /* NOCOW and compression options are mutually exclusive */
+ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+
return 0;
}
-static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+ unsigned int flags)
{
- struct inode *inode = file_inode(file);
+ if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
+ return -EPERM;
+
+ return 0;
+}
+
+/*
+ * Set flags/xflags from the internal inode flags. The remaining items of
+ * fsxattr are zeroed.
+ */
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
+ return 0;
+}
+
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
@@ -193,33 +243,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags = binode->flags;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
+ u32 binode_flags;
if (btrfs_root_readonly(root))
return -EROFS;
- if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
- return -EFAULT;
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- ret = check_fsflags(fsflags);
+ fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(binode);
+ ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
- ret = mnt_want_write_file(file);
+ ret = check_fsflags_compatible(fs_info, fsflags);
if (ret)
return ret;
- inode_lock(inode);
-
- fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
- ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
- if (ret)
- goto out_unlock;
-
+ binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
else
@@ -240,6 +282,16 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
binode_flags |= BTRFS_INODE_NOATIME;
else
binode_flags &= ~BTRFS_INODE_NOATIME;
+
+ /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
+ if (!fa->flags_valid) {
+ /* 1 item for the inode */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto update_flags;
+ }
+
if (fsflags & FS_DIRSYNC_FL)
binode_flags |= BTRFS_INODE_DIRSYNC;
else
@@ -280,10 +332,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
binode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- goto out_unlock;
- }
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
binode_flags |= BTRFS_INODE_COMPRESS;
binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
@@ -300,10 +350,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* 2 for properties
*/
trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
@@ -321,161 +369,98 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
+update_flags:
binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
- out_unlock:
- inode_unlock(inode);
- mnt_drop_write_file(file);
return ret;
}
/*
- * Translate btrfs internal inode flags to xflags as expected by the
- * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
- * silently dropped.
+ * Start exclusive operation @type, return true on success
*/
-static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
{
- unsigned int xflags = 0;
+ bool ret = false;
- if (flags & BTRFS_INODE_APPEND)
- xflags |= FS_XFLAG_APPEND;
- if (flags & BTRFS_INODE_IMMUTABLE)
- xflags |= FS_XFLAG_IMMUTABLE;
- if (flags & BTRFS_INODE_NOATIME)
- xflags |= FS_XFLAG_NOATIME;
- if (flags & BTRFS_INODE_NODUMP)
- xflags |= FS_XFLAG_NODUMP;
- if (flags & BTRFS_INODE_SYNC)
- xflags |= FS_XFLAG_SYNC;
-
- return xflags;
-}
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
+ fs_info->exclusive_operation = type;
+ ret = true;
+ }
+ spin_unlock(&fs_info->super_lock);
-/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
-static int check_xflags(unsigned int flags)
-{
- if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
- FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
- return -EOPNOTSUPP;
- return 0;
+ return ret;
}
/*
- * Set the xflags from the internal inode flags. The remaining items of fsxattr
- * are zeroed.
+ * Conditionally allow to enter the exclusive operation in case it's compatible
+ * with the running one. This must be paired with btrfs_exclop_start_unlock and
+ * btrfs_exclop_finish.
+ *
+ * Compatibility:
+ * - the same type is already running
+ * - when trying to add a device and balance has been paused
+ * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
+ * must check the condition first that would allow none -> @type
*/
-static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
{
- struct btrfs_inode *binode = BTRFS_I(file_inode(file));
- struct fsxattr fa;
-
- simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == type ||
+ (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+ type == BTRFS_EXCLOP_DEV_ADD))
+ return true;
- return 0;
+ spin_unlock(&fs_info->super_lock);
+ return false;
}
-static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
{
- struct inode *inode = file_inode(file);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
- struct btrfs_trans_handle *trans;
- struct fsxattr fa, old_fa;
- unsigned old_flags;
- unsigned old_i_flags;
- int ret = 0;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- ret = check_xflags(fa.fsx_xflags);
- if (ret)
- return ret;
-
- if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- old_flags = binode->flags;
- old_i_flags = inode->i_flags;
-
- simple_fill_fsxattr(&old_fa,
- btrfs_inode_flags_to_xflags(binode->flags));
- ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (ret)
- goto out_unlock;
-
- if (fa.fsx_xflags & FS_XFLAG_SYNC)
- binode->flags |= BTRFS_INODE_SYNC;
- else
- binode->flags &= ~BTRFS_INODE_SYNC;
- if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
- binode->flags |= BTRFS_INODE_IMMUTABLE;
- else
- binode->flags &= ~BTRFS_INODE_IMMUTABLE;
- if (fa.fsx_xflags & FS_XFLAG_APPEND)
- binode->flags |= BTRFS_INODE_APPEND;
- else
- binode->flags &= ~BTRFS_INODE_APPEND;
- if (fa.fsx_xflags & FS_XFLAG_NODUMP)
- binode->flags |= BTRFS_INODE_NODUMP;
- else
- binode->flags &= ~BTRFS_INODE_NODUMP;
- if (fa.fsx_xflags & FS_XFLAG_NOATIME)
- binode->flags |= BTRFS_INODE_NOATIME;
- else
- binode->flags &= ~BTRFS_INODE_NOATIME;
-
- /* 1 item for the inode */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
-
- btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ spin_unlock(&fs_info->super_lock);
+}
- btrfs_end_transaction(trans);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->super_lock);
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ spin_unlock(&fs_info->super_lock);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
-out_unlock:
- if (ret) {
- binode->flags = old_flags;
- inode->i_flags = old_i_flags;
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op)
+{
+ switch (op) {
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ default:
+ btrfs_warn(fs_info,
+ "invalid exclop balance operation %d requested", op);
}
-
- inode_unlock(inode);
- mnt_drop_write_file(file);
-
- return ret;
}
-static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
{
- struct inode *inode = file_inode(file);
-
return put_user(inode->i_generation, arg);
}
@@ -483,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_device *device;
- struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
@@ -493,6 +477,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return -EPERM;
/*
+ * btrfs_trim_block_group() depends on space cache, which is not
+ * available in zoned filesystem. So, disallow fitrim on a zoned
+ * filesystem for now.
+ */
+ if (btrfs_is_zoned(fs_info))
+ return -EOPNOTSUPP;
+
+ /*
* If the fs is mounted with nologreplay, which requires it to be
* mounted in RO mode as well, we can not allow discard on free space
* inside block groups, because log trees refer to extents that are not
@@ -505,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
- if (!device->bdev)
+ if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
continue;
- q = bdev_get_queue(device->bdev);
- if (blk_queue_discard(q)) {
- num_devices++;
- minlen = min_t(u64, q->limits.discard_granularity,
- minlen);
- }
+ num_devices++;
+ minlen = min_t(u64, bdev_discard_granularity(device->bdev),
+ minlen);
}
rcu_read_unlock();
@@ -551,10 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
-static noinline int create_subvol(struct inode *dir,
- struct dentry *dentry,
- const char *name, int namelen,
- u64 *async_transid,
+/*
+ * Calculate the number of transaction items to reserve for creating a subvolume
+ * or snapshot, not including the inode, directory entries, or parent directory.
+ */
+static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+{
+ /*
+ * 1 to add root block
+ * 1 to add root item
+ * 1 to add root ref
+ * 1 to add root backref
+ * 1 to add UUID item
+ * 1 to add qgroup info
+ * 1 to add qgroup limit
+ *
+ * Ideally the last two would only be accounted if qgroups are enabled,
+ * but that can change between now and the time we would insert them.
+ */
+ unsigned int num_items = 7;
+
+ if (inherit) {
+ /* 2 to add qgroup relations for each inherited qgroup */
+ num_items += 2 * inherit->num_qgroups;
+ }
+ return num_items;
+}
+
+static noinline int create_subvol(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -567,21 +581,23 @@ static noinline int create_subvol(struct inode *dir,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
- struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .subvol = true,
+ };
+ unsigned int trans_num_items;
int ret;
- int err;
+ dev_t anon_dev;
u64 objectid;
- u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
- u64 index = 0;
- uuid_le new_uuid;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
return -ENOMEM;
- ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
+ ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto fail_free;
+ goto out_root_item;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
@@ -589,35 +605,47 @@ static noinline int create_subvol(struct inode *dir,
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
- goto fail_free;
+ goto out_root_item;
}
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto out_root_item;
+
+ new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+ if (!new_inode_args.inode) {
+ ret = -ENOMEM;
+ goto out_anon_dev;
+ }
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+ trans_num_items += create_subvol_num_items(inherit);
+
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * The same as the snapshot creation, please see the comment
- * of create_snapshot().
- */
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ trans_num_items, false);
if (ret)
- goto fail_free;
+ goto out_new_inode_args;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
- goto fail_free;
+ btrfs_subvolume_release_metadata(root, &block_rsv);
+ goto out_new_inode_args;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
- goto fail;
+ goto out;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
- goto fail;
+ goto out;
}
btrfs_mark_buffer_dirty(leaf);
@@ -643,8 +671,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_generation_v2(root_item,
btrfs_root_generation(root_item));
- uuid_le_gen(&new_uuid);
- memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(root_item->uuid);
btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
root_item->ctime = root_item->otime;
@@ -652,118 +679,106 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- leaf = NULL;
- btrfs_set_root_dirid(root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
- if (ret)
- goto fail;
-
- key.offset = (u64)-1;
- new_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- btrfs_record_root_in_trans(trans, new_root);
-
- ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
if (ret) {
- /* We potentially lose an unused inode item here */
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ /*
+ * Since we don't abort the transaction in this case, free the
+ * tree block so that we don't leak space and leave the
+ * filesystem in an inconsistent state (an extent item in the
+ * extent tree with a backreference for a root that does not
+ * exists).
+ */
+ btrfs_tree_lock(leaf);
+ btrfs_clean_tree_block(leaf);
+ btrfs_tree_unlock(leaf);
+ btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ free_extent_buffer(leaf);
+ goto out;
}
- mutex_lock(&new_root->objectid_mutex);
- new_root->highest_objectid = new_dirid;
- mutex_unlock(&new_root->objectid_mutex);
+ free_extent_buffer(leaf);
+ leaf = NULL;
- /*
- * insert the directory item
- */
- ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (ret) {
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
+ /* anon_dev is owned by new_root now. */
+ anon_dev = 0;
+ BTRFS_I(new_inode_args.inode)->root = new_root;
+ /* ... and new_root is owned by new_inode_args.inode now. */
- ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
- BTRFS_FT_DIR, index);
+ ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
- btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_uuid_tree_add(trans, root_item->uuid,
- BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ d_instantiate_new(dentry, new_inode_args.inode);
+ new_inode_args.inode = NULL;
-fail:
- kfree(root_item);
+out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
-
- if (async_transid) {
- *async_transid = trans->transid;
- err = btrfs_commit_transaction_async(trans, 1);
- if (err)
- err = btrfs_commit_transaction(trans);
- } else {
- err = btrfs_commit_transaction(trans);
- }
- if (err && !ret)
- ret = err;
+ btrfs_subvolume_release_metadata(root, &block_rsv);
- if (!ret) {
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- d_instantiate(dentry, inode);
- }
- return ret;
-
-fail_free:
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ ret = btrfs_commit_transaction(trans);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ iput(new_inode_args.inode);
+out_anon_dev:
+ if (anon_dev)
+ free_anon_bdev(anon_dev);
+out_root_item:
kfree(root_item);
return ret;
}
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
- struct dentry *dentry,
- u64 *async_transid, bool readonly,
+ struct dentry *dentry, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
+ unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
int ret;
- bool snapshot_force_cow = false;
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ /* We do not support snapshotting right now. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_warn(fs_info,
+ "extent tree v2 doesn't support snapshotting yet");
+ return -EOPNOTSUPP;
+ }
+
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
if (atomic_read(&root->nr_swapfiles)) {
@@ -776,6 +791,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!pending_snapshot)
return -ENOMEM;
+ ret = get_anon_bdev(&pending_snapshot->anon_dev);
+ if (ret < 0)
+ goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
@@ -784,46 +802,19 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
- /*
- * Force new buffered writes to reserve space even when NOCOW is
- * possible. This is to avoid later writeback (running dealloc) to
- * fallback to COW mode and unexpectedly fail with ENOSPC.
- */
- atomic_inc(&root->will_be_snapshotted);
- smp_mb__after_atomic();
- /* wait for no snapshot writes */
- wait_event(root->subv_writers->wait,
- percpu_counter_sum(&root->subv_writers->counter) == 0);
-
- ret = btrfs_start_delalloc_snapshot(root);
- if (ret)
- goto dec_and_free;
-
- /*
- * All previous writes have started writeback in NOCOW mode, so now
- * we force future writes to fallback to COW mode during snapshot
- * creation.
- */
- atomic_inc(&root->snapshot_force_cow);
- snapshot_force_cow = true;
-
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - parent dir inode
- * 2 - dir entries
- * 1 - root item
- * 2 - root ref/backref
- * 1 - root of snapshot
- * 1 - UUID item
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
*/
+ trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv, 8,
- false);
+ &pending_snapshot->block_rsv,
+ trans_num_items, false);
if (ret)
- goto dec_and_free;
+ goto free_pending;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -837,18 +828,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto fail;
}
- spin_lock(&fs_info->trans_lock);
- list_add(&pending_snapshot->list,
- &trans->transaction->pending_snapshots);
- spin_unlock(&fs_info->trans_lock);
- if (async_transid) {
- *async_transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, 1);
- if (ret)
- ret = btrfs_commit_transaction(trans);
- } else {
- ret = btrfs_commit_transaction(trans);
- }
+ trans->pending_snapshot = pending_snapshot;
+
+ ret = btrfs_commit_transaction(trans);
if (ret)
goto fail;
@@ -868,14 +850,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode);
ret = 0;
+ pending_snapshot->anon_dev = 0;
fail:
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
-dec_and_free:
- if (snapshot_force_cow)
- atomic_dec(&root->snapshot_force_cow);
- if (atomic_dec_and_test(&root->will_be_snapshotted))
- wake_up_var(&root->will_be_snapshotted);
+ /* Prevent double freeing of anon_dev */
+ if (ret && pending_snapshot->snap)
+ pending_snapshot->snap->anon_dev = 0;
+ btrfs_put_root(pending_snapshot->snap);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
+ if (pending_snapshot->anon_dev)
+ free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);
@@ -903,7 +887,8 @@ free_pending:
* nfs_async_unlink().
*/
-static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+static int btrfs_may_delete(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *victim, int isdir)
{
int error;
@@ -913,13 +898,14 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
- IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+ if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
+ IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+ IS_SWAPFILE(d_inode(victim)))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
@@ -936,13 +922,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
}
/* copy of may_create in fs/namei.c() */
-static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+static inline int btrfs_may_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
+ return -EOVERFLOW;
+ return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -951,9 +940,10 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* inside this filesystem so it's quite a bit simpler.
*/
static noinline int btrfs_mksubvol(const struct path *parent,
+ struct user_namespace *mnt_userns,
const char *name, int namelen,
struct btrfs_root *snap_src,
- u64 *async_transid, bool readonly,
+ bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = d_inode(parent->dentry);
@@ -965,12 +955,12 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (error == -EINTR)
return error;
- dentry = lookup_one_len(name, parent->dentry, namelen);
+ dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
- error = btrfs_may_create(dir, dentry);
+ error = btrfs_may_create(mnt_userns, dir, dentry);
if (error)
goto out_dput;
@@ -989,13 +979,11 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
goto out_up_read;
- if (snap_src) {
- error = create_snapshot(snap_src, dir, dentry,
- async_transid, readonly, inherit);
- } else {
- error = create_subvol(dir, dentry, name, namelen,
- async_transid, inherit);
- }
+ if (snap_src)
+ error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ else
+ error = create_subvol(mnt_userns, dir, dentry, inherit);
+
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
@@ -1003,133 +991,237 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- inode_unlock(dir);
+ btrfs_inode_unlock(dir, 0);
return error;
}
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
+static noinline int btrfs_mksnapshot(const struct path *parent,
+ struct user_namespace *mnt_userns,
+ const char *name, int namelen,
+ struct btrfs_root *root,
+ bool readonly,
+ struct btrfs_qgroup_inherit *inherit)
{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
+ int ret;
+ bool snapshot_force_cow = false;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
+ /*
+ * Force new buffered writes to reserve space even when NOCOW is
+ * possible. This is to avoid later writeback (running dealloc) to
+ * fallback to COW mode and unexpectedly fail with ENOSPC.
+ */
+ btrfs_drew_read_lock(&root->snapshot_lock);
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
- }
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
+ ret = btrfs_start_delalloc_snapshot(root, false);
+ if (ret)
+ goto out;
+
+ /*
+ * All previous writes have started writeback in NOCOW mode, so now
+ * we force future writes to fallback to COW mode during snapshot
+ * creation.
+ */
+ atomic_inc(&root->snapshot_force_cow);
+ snapshot_force_cow = true;
+
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
+ root, readonly, inherit);
+out:
+ if (snapshot_force_cow)
+ atomic_dec(&root->snapshot_force_cow);
+ btrfs_drew_read_unlock(&root->snapshot_lock);
+ return ret;
}
/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
*
- * This is used by the defragging code to find new and small
- * extents
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
*/
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still has the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely not extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
}
- if (min_key.offset == (u64)-1)
- goto none;
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
- min_key.offset++;
- btrfs_release_path(path);
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
}
-none:
- btrfs_free_path(path);
- return -ENOENT;
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ u64 newer_than, bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent(io_tree, start, end, &cached);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1138,274 +1230,586 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return fs_info->max_extent_size;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ u32 extent_thresh, u64 newer_than, bool locked)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *next;
- bool ret = true;
+ bool ret = false;
/* this is the last extent */
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ /*
+ * Here we need to pass @newer_then when checking the next extent, or
+ * we will hit a case we mark current extent for defrag, but the next
+ * one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+ /* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- ret = false;
- else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > SZ_128K && next->block_len > SZ_128K))
- ret = false;
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(fs_info, em))
+ goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+ ret = true;
+out:
free_extent_map(next);
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: Caller should also wait for page writeback after the cluster is
+ * prepared, here we don't do writeback wait for each page.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
- /*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
- */
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
/*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
*/
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ * will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression
+ * if true, @extent_thresh will be ignored and all regular
+ * file extents meeting @newer_than will be targets.
+ * @locked: if the range has already held extent lock
+ * @target_list: list of targets file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ bool last_is_target = false;
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur,
+ newer_than, locked);
+ if (!em)
+ break;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /*
+ * If the file extent is an inlined one, we may still want to
+ * defrag it (fallthrough) if it will cause a regular extent.
+ * This is for users who want to convert inline extents to
+ * regular ones through max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(inode,
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
+ /*
+ * Our start offset might be in the middle of an existing extent
+ * map, so take that into account.
+ */
+ range_len = em->len - (cur - em->start);
+ /*
+ * If this range of the extent map is already flagged for delalloc,
+ * skip it, because:
+ *
+ * 1) We could deadlock later, when trying to reserve space for
+ * delalloc, because in case we can't immediately reserve space
+ * the flusher can start delalloc and wait for the respective
+ * ordered extents to complete. The deadlock would happen
+ * because we do the space reservation while holding the range
+ * locked, and starting writeback, or finishing an ordered
+ * extent, requires locking the range;
+ *
+ * 2) If there's delalloc there, it means there's dirty pages for
+ * which writeback has not started yet (we clean the delalloc
+ * flag when starting writeback and after creating an ordered
+ * extent). If we mark pages in an adjacent range for defrag,
+ * then we will have a larger contiguous range for delalloc,
+ * very likely resulting in a larger extent after writeback is
+ * triggered (except in a case of free space fragmentation).
+ */
+ if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC, 0, NULL))
+ goto next;
+
+ /*
+ * For do_compress case, we want to compress all valid file
+ * extents, thus no @extent_thresh or mergeable check.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (range_len >= extent_thresh)
+ goto next;
+
+ /*
+ * Skip extents already at its max capacity, this is mostly for
+ * compressed extents, which max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(fs_info, em))
+ goto next;
+
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and not defragged.
+ * So if an inline extent passed all above checks, just add it
+ * for defrag, and be converted to regular extents.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ extent_thresh, newer_than, locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
+add:
+ last_is_target = true;
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target, check if it can be merged into
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- pages[i] = page;
- i_done++;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
}
- if (!i_done || ret)
- goto out;
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ }
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
+#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
+
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ }
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
wait_on_page_writeback(pages[i]);
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ /* Lock the pages range */
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+ /*
+ * Now we have a consistent view about the extent map, re-check
+ * which range really needs to be defragged.
+ *
+ * And this time we have extent locked already, pass @locked = true
+ * so that we won't relock the extent range and cause deadlock.
+ */
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list, last_scanned_ret);
+ if (ret < 0)
+ goto unlock_extent;
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(inode, data_reserved,
- start_index << PAGE_SHIFT,
- (page_cnt - i_done) << PAGE_SHIFT, true);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
+unlock_extent:
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+ kfree(pages);
+ return ret;
+}
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list, NULL);
+ if (ret < 0)
+ goto out;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_extent_mapped(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
+ /* Reached or beyond the limit */
+ if (max_sectors && *sectors_defragged >= max_sectors) {
+ ret = 1;
+ break;
+ }
+
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
+
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * Here we may not defrag any range if holes are punched before
+ * we locked the pages.
+ * But that's fine, it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len >>
+ inode->root->fs_info->sectorsize_bits;
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- btrfs_delalloc_release_space(inode, data_reserved,
- start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
return ret;
-
}
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NUL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
+ * will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where next defrag should be started at.
+ * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
+ * defragging all the range).
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ pgoff_t start_index;
if (isize == 0)
return 0;
@@ -1423,178 +1827,162 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len);
+ } else {
+ /* Defrag until file end */
+ last_byte = isize;
+ }
+
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
}
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
- }
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
-
/*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
+ * Make writeback start from the beginning of the range, so that the
+ * defrag range can be written sequentially.
*/
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
+ start_index = cur >> PAGE_SHIFT;
+ if (start_index < inode->i_mapping->writeback_index)
+ inode->i_mapping->writeback_index = start_index;
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
+ while (cur < last_byte) {
+ const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
+ u64 cluster_end;
if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
-
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
- }
-
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
- inode_unlock(inode);
- goto out_ra;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
+ break;
}
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
- inode_unlock(inode);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
+ if (sectors_defragged > prev_sectors_defragged)
+ balance_dirty_pages_ratelimited(inode->i_mapping);
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
-
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
+ btrfs_inode_unlock(inode, 0);
+ if (ret < 0)
+ break;
+ cur = max(cluster_end + 1, last_scanned);
+ if (ret > 0) {
+ ret = 0;
+ break;
}
+ cond_resched();
}
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ /*
+ * Update range.start for autodefrag, this will indicate where to start
+ * in next run.
+ */
+ range->start = cur;
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors, for compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
+ if (do_compress) {
+ btrfs_inode_lock(inode, 0);
+ BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
+ }
+ return ret;
+}
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+/*
+ * Try to start exclusive operation @type or cancel it if it's running.
+ *
+ * Return:
+ * 0 - normal mode, newly claimed op started
+ * >0 - normal mode, something else is running,
+ * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
+ * ECANCELED - cancel mode, successful cancel
+ * ENOTCONN - cancel mode, operation not running anymore
+ */
+static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type, bool cancel)
+{
+ if (!cancel) {
+ /* Start normal op */
+ if (!btrfs_exclop_start(fs_info, type))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ /* Exclusive operation is now claimed */
+ return 0;
}
- ret = defrag_count;
+ /* Cancel running op */
+ if (btrfs_exclop_start_try_lock(fs_info, type)) {
+ /*
+ * This blocks any exclop finish from setting it to NONE, so we
+ * request cancellation. Either it runs and we will wait for it,
+ * or it has finished and no waiting will happen.
+ */
+ atomic_inc(&fs_info->reloc_cancel_req);
+ btrfs_exclop_start_unlock(fs_info);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
+ TASK_INTERRUPTIBLE);
-out_ra:
- if (do_compress) {
- inode_lock(inode);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- inode_unlock(inode);
+ return -ECANCELED;
}
- if (!file)
- kfree(ra);
- kfree(pages);
- return ret;
+
+ /* Something else is running or none */
+ return -ENOTCONN;
}
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
@@ -1609,6 +1997,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
char *devstr = NULL;
int ret = 0;
int mod = 0;
+ bool cancel;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1617,20 +2006,23 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- mnt_drop_write_file(file);
- return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- }
-
+ /*
+ * Read the arguments before checking exclusivity to be able to
+ * distinguish regular resize and cancel
+ */
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto out_drop;
}
-
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-
sizestr = vol_args->name;
+ cancel = (strcmp("cancel", sizestr) == 0);
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
+ if (ret)
+ goto out_free;
+ /* Exclusive operation is now claimed */
+
devstr = strchr(sizestr, ':');
if (devstr) {
sizestr = devstr + 1;
@@ -1638,20 +2030,21 @@ static noinline int btrfs_ioctl_resize(struct file *file,
devstr = vol_args->name;
ret = kstrtoull(devstr, 10, &devid);
if (ret)
- goto out_free;
+ goto out_finish;
if (!devid) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
ret = -ENODEV;
- goto out_free;
+ goto out_finish;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1659,11 +2052,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,
"resizer unable to apply on readonly device %llu",
devid);
ret = -EPERM;
- goto out_free;
+ goto out_finish;
}
if (!strcmp(sizestr, "max"))
- new_size = device->bdev->bd_inode->i_size;
+ new_size = bdev_nr_bytes(device->bdev);
else {
if (sizestr[0] == '-') {
mod = -1;
@@ -1675,13 +2068,13 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = memparse(sizestr, &retptr);
if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
}
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -EPERM;
- goto out_free;
+ goto out_finish;
}
old_size = btrfs_device_get_total_bytes(device);
@@ -1689,36 +2082,33 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
new_size = old_size - new_size;
} else if (mod > 0) {
if (new_size > ULLONG_MAX - old_size) {
ret = -ERANGE;
- goto out_free;
+ goto out_finish;
}
new_size = old_size + new_size;
}
if (new_size < SZ_256M) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
- if (new_size > device->bdev->bd_inode->i_size) {
+ if (new_size > bdev_nr_bytes(device->bdev)) {
ret = -EFBIG;
- goto out_free;
+ goto out_finish;
}
new_size = round_down(new_size, fs_info->sectorsize);
- btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
- rcu_str_deref(device->name), new_size);
-
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_free;
+ goto out_finish;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans);
@@ -1726,17 +2116,24 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = btrfs_shrink_device(device, new_size);
} /* equal, nothing need to do */
+ if (ret == 0 && new_size != old_size)
+ btrfs_info_in_rcu(fs_info,
+ "resize device %s (devid %llu) from %llu to %llu",
+ rcu_str_deref(device->name), device->devid,
+ old_size, new_size);
+out_finish:
+ btrfs_exclop_finish(fs_info);
out_free:
kfree(vol_args);
-out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+out_drop:
mnt_drop_write_file(file);
return ret;
}
-static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+static noinline int __btrfs_ioctl_snap_create(struct file *file,
+ struct user_namespace *mnt_userns,
const char *name, unsigned long fd, int subvol,
- u64 *transid, bool readonly,
+ bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
int namelen;
@@ -1762,8 +2159,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, transid, readonly, inherit);
+ ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
+ namelen, NULL, readonly, inherit);
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
@@ -1777,16 +2174,17 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
- } else if (!inode_owner_or_capable(src_inode)) {
+ } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
* are limited to own subvolumes only
*/
ret = -EPERM;
} else {
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- BTRFS_I(src_inode)->root,
- transid, readonly, inherit);
+ ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
+ name, namelen,
+ BTRFS_I(src_inode)->root,
+ readonly, inherit);
}
fdput(src);
}
@@ -1810,9 +2208,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol,
- NULL, false, NULL);
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ false, NULL);
kfree(vol_args);
return ret;
@@ -1823,8 +2221,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
- u64 transid = 0;
- u64 *ptr = NULL;
bool readonly = false;
struct btrfs_qgroup_inherit *inherit = NULL;
@@ -1836,26 +2232,18 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (vol_args->flags &
- ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
- BTRFS_SUBVOL_QGROUP_INHERIT)) {
+ if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto free_args;
}
- if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- btrfs_warn(fs_info,
-"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");
-
- ptr = &transid;
- }
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- if (vol_args->size > PAGE_SIZE) {
+ u64 nums;
+
+ if (vol_args->size < sizeof(*inherit) ||
+ vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
@@ -1864,20 +2252,27 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = PTR_ERR(inherit);
goto free_args;
}
+
+ if (inherit->num_qgroups > PAGE_SIZE ||
+ inherit->num_ref_copies > PAGE_SIZE ||
+ inherit->num_excl_copies > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
+
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ if (vol_args->size != struct_size(inherit, qgroups, nums)) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
}
- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol, ptr,
- readonly, inherit);
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ readonly, inherit);
if (ret)
goto free_inherit;
-
- if (ptr && copy_to_user(arg +
- offsetof(struct btrfs_ioctl_vol_args_v2,
- transid),
- ptr, sizeof(*ptr)))
- ret = -EFAULT;
-
free_inherit:
kfree(inherit);
free_args:
@@ -1885,10 +2280,9 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
void __user *arg)
{
- struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1919,7 +2313,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
u64 flags;
int ret = 0;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
return -EPERM;
ret = mnt_want_write_file(file);
@@ -1936,11 +2330,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_drop_write;
}
- if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
- ret = -EINVAL;
- goto out_drop_write;
- }
-
if (flags & ~BTRFS_SUBVOL_RDONLY) {
ret = -EOPNOTSUPP;
goto out_drop_write;
@@ -2057,7 +2446,7 @@ static noinline int copy_to_sk(struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
item_off = btrfs_item_ptr_offset(leaf, i);
- item_len = btrfs_item_size_nr(leaf, i);
+ item_len = btrfs_item_size(leaf, i);
btrfs_item_key_to_cpu(leaf, key, i);
if (!key_in_sk(key, sk))
@@ -2090,9 +2479,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2100,10 +2494,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2174,12 +2572,9 @@ static noinline int search_ioctl(struct inode *inode,
if (sk->tree_id == 0) {
/* search the root of the inode that was passed */
- root = BTRFS_I(inode)->root;
+ root = btrfs_grab_root(BTRFS_I(inode)->root);
} else {
- key.objectid = sk->tree_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(info, &key);
+ root = btrfs_get_fs_root(info, sk->tree_id, true);
if (IS_ERR(root)) {
btrfs_free_path(path);
return PTR_ERR(root);
@@ -2191,6 +2586,15 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = -EFAULT;
+ /*
+ * Ensure that the whole user buffer is faulted in at sub-page
+ * granularity, otherwise the loop may live-lock.
+ */
+ if (fault_in_subpage_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset))
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -2208,30 +2612,27 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
err:
sk->nr_items = num_found;
+ btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct file *file,
- void __user *argp)
+static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+ void __user *argp)
{
- struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
- struct inode *inode;
int ret;
size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- uargs = (struct btrfs_ioctl_search_args __user *)argp;
-
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
buf_size = sizeof(uargs->buf);
- inode = file_inode(file);
ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
/*
@@ -2246,12 +2647,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
- struct inode *inode;
int ret;
size_t buf_size;
const size_t buf_limit = SZ_16M;
@@ -2260,7 +2660,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
return -EPERM;
/* copy search header and buffer size */
- uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
@@ -2270,7 +2669,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
if (buf_size > buf_limit)
buf_size = buf_limit;
- inode = file_inode(file);
ret = search_ioctl(inode, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
@@ -2311,12 +2709,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
- key.objectid = tree_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(info, &key);
+ root = btrfs_get_fs_root(info, tree_id, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
+ root = NULL;
goto out;
}
@@ -2325,23 +2721,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
else if (ret > 0) {
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0)
- goto out;
- else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ ret = -ENOENT;
+ goto out;
}
l = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(l, &key, slot);
iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(l, iref);
@@ -2367,11 +2756,13 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
name[total_len] = '\0';
ret = 0;
out:
+ btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
-static int btrfs_search_path_in_tree_user(struct inode *inode,
+static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -2383,7 +2774,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
unsigned long item_len;
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
+ struct btrfs_root *root = NULL;
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
@@ -2405,10 +2796,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
if (dirid != upper_limit.objectid) {
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
- key.objectid = treeid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &key);
+ root = btrfs_get_fs_root(fs_info, treeid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
@@ -2418,23 +2806,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0)
+ goto out_put;
+ else if (ret > 0) {
+ ret = -ENOENT;
+ goto out_put;
}
leaf = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(leaf, iref);
@@ -2442,7 +2823,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
total_len += len + 1;
if (ptr < args->path) {
ret = -ENAMETOOLONG;
- goto out;
+ goto out_put;
}
*(ptr + len) = '/';
@@ -2453,10 +2834,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_ITEM_KEY);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
leaf = path->nodes[0];
@@ -2464,26 +2845,27 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
btrfs_item_key_to_cpu(leaf, &key2, slot);
if (key2.objectid != dirid) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
- temp_inode = btrfs_iget(sb, &key2, root);
+ temp_inode = btrfs_iget(sb, key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
- goto out;
+ goto out_put;
}
- ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+ ret = inode_permission(mnt_userns, temp_inode,
+ MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
ret = -EACCES;
- goto out;
+ goto out_put;
}
if (key.offset == upper_limit.objectid)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
- goto out;
+ goto out_put;
}
btrfs_release_path(path);
@@ -2494,15 +2876,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
memmove(args->path, ptr, total_len);
args->path[total_len] = '\0';
+ btrfs_put_root(root);
+ root = NULL;
btrfs_release_path(path);
}
/* Get the bottom subvolume's name from ROOT_REF */
- root = fs_info->tree_root;
key.objectid = treeid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
@@ -2515,7 +2898,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
btrfs_item_key_to_cpu(leaf, &key, slot);
item_off = btrfs_item_ptr_offset(leaf, slot);
- item_len = btrfs_item_size_nr(leaf, slot);
+ item_len = btrfs_item_size(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
@@ -2529,30 +2912,29 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
read_extent_buffer(leaf, args->name, item_off, item_len);
args->name[item_len] = 0;
+out_put:
+ btrfs_put_root(root);
out:
btrfs_free_path(path);
return ret;
}
-static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_ino_lookup_args *args;
- struct inode *inode;
int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
return PTR_ERR(args);
- inode = file_inode(file);
-
/*
* Unprivileged query to obtain the containing subvolume root id. The
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ args->treeid = root->root_key.objectid;
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2564,7 +2946,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
goto out;
}
- ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_search_path_in_tree(root->fs_info,
args->treeid, args->objectid,
args->name);
@@ -2610,7 +2992,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
return -EACCES;
}
- ret = btrfs_search_path_in_tree_user(inode, args);
+ ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2620,7 +3002,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
}
/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
-static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
struct btrfs_ioctl_get_subvol_info_args *subvol_info;
struct btrfs_fs_info *fs_info;
@@ -2632,7 +3014,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
struct extent_buffer *leaf;
unsigned long item_off;
unsigned long item_len;
- struct inode *inode;
int slot;
int ret = 0;
@@ -2646,17 +3027,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
return -ENOMEM;
}
- inode = file_inode(file);
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &key);
+ root = btrfs_get_fs_root(fs_info, key.objectid, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
- goto out;
+ goto out_free;
}
root_item = &root->root_item;
@@ -2689,16 +3067,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
/* Search root tree for ROOT_BACKREF of this subvolume */
- root = fs_info->tree_root;
-
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
@@ -2719,7 +3095,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
item_off = btrfs_item_ptr_offset(leaf, slot)
+ sizeof(struct btrfs_root_ref);
- item_len = btrfs_item_size_nr(leaf, slot)
+ item_len = btrfs_item_size(leaf, slot)
- sizeof(struct btrfs_root_ref);
read_extent_buffer(leaf, subvol_info->name,
item_off, item_len);
@@ -2733,8 +3109,10 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
ret = -EFAULT;
out:
+ btrfs_put_root(root);
+out_free:
btrfs_free_path(path);
- kzfree(subvol_info);
+ kfree(subvol_info);
return ret;
}
@@ -2742,15 +3120,14 @@ out:
* Return ROOT_REF information of the subvolume containing this inode
* except the subvolume name.
*/
-static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
+ void __user *argp)
{
struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct inode *inode;
u64 objectid;
int slot;
int ret;
@@ -2766,15 +3143,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
return PTR_ERR(rootrefs);
}
- inode = file_inode(file);
- root = BTRFS_I(inode)->root->fs_info->tree_root;
- objectid = BTRFS_I(inode)->root->root_key.objectid;
-
+ objectid = root->root_key.objectid;
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
found = 0;
+ root = root->fs_info->tree_root;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
@@ -2836,7 +3211,8 @@ out:
}
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
- void __user *arg)
+ void __user *arg,
+ bool destroy_v2)
{
struct dentry *parent = file->f_path.dentry;
struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
@@ -2845,34 +3221,145 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
- struct btrfs_ioctl_vol_args *vol_args;
- int namelen;
+ struct btrfs_ioctl_vol_args *vol_args = NULL;
+ struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ char *subvol_name, *subvol_name_ptr = NULL;
+ int subvol_namelen;
int err = 0;
+ bool destroy_parent = false;
- if (!S_ISDIR(dir->i_mode))
- return -ENOTDIR;
+ /* We don't support snapshots with extent tree v2 yet. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
- vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (destroy_v2) {
+ vol_args2 = memdup_user(arg, sizeof(*vol_args2));
+ if (IS_ERR(vol_args2))
+ return PTR_ERR(vol_args2);
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- namelen = strlen(vol_args->name);
- if (strchr(vol_args->name, '/') ||
- strncmp(vol_args->name, "..", namelen) == 0) {
- err = -EINVAL;
- goto out;
+ if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /*
+ * If SPEC_BY_ID is not set, we are looking for the subvolume by
+ * name, same as v1 currently does.
+ */
+ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
+ vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ subvol_name = vol_args2->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ } else {
+ struct inode *old_dir;
+
+ if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+
+ dentry = btrfs_get_dentry(fs_info->sb,
+ BTRFS_FIRST_FREE_OBJECTID,
+ vol_args2->subvolid, 0, 0);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_drop_write;
+ }
+
+ /*
+ * Change the default parent since the subvolume being
+ * deleted can be outside of the current mount point.
+ */
+ parent = btrfs_get_parent(dentry);
+
+ /*
+ * At this point dentry->d_name can point to '/' if the
+ * subvolume we want to destroy is outsite of the
+ * current mount point, so we need to release the
+ * current dentry and execute the lookup to return a new
+ * one with ->d_name pointing to the
+ * <mount point>/subvol_name.
+ */
+ dput(dentry);
+ if (IS_ERR(parent)) {
+ err = PTR_ERR(parent);
+ goto out_drop_write;
+ }
+ old_dir = dir;
+ dir = d_inode(parent);
+
+ /*
+ * If v2 was used with SPEC_BY_ID, a new parent was
+ * allocated since the subvolume can be outside of the
+ * current mount point. Later on we need to release this
+ * new parent dentry.
+ */
+ destroy_parent = true;
+
+ /*
+ * On idmapped mounts, deletion via subvolid is
+ * restricted to subvolumes that are immediate
+ * ancestors of the inode referenced by the file
+ * descriptor in the ioctl. Otherwise the idmapping
+ * could potentially be abused to delete subvolumes
+ * anywhere in the filesystem the user wouldn't be able
+ * to delete without an idmapped mount.
+ */
+ if (old_dir != dir && mnt_userns != &init_user_ns) {
+ err = -EOPNOTSUPP;
+ goto free_parent;
+ }
+
+ subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
+ fs_info, vol_args2->subvolid);
+ if (IS_ERR(subvol_name_ptr)) {
+ err = PTR_ERR(subvol_name_ptr);
+ goto free_parent;
+ }
+ /* subvol_name_ptr is already nul terminated */
+ subvol_name = (char *)kbasename(subvol_name_ptr);
+ }
+ } else {
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ subvol_name = vol_args->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
}
- err = mnt_want_write_file(file);
- if (err)
- goto out;
+ subvol_namelen = strlen(subvol_name);
+ if (strchr(subvol_name, '/') ||
+ strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ err = -EINVAL;
+ goto free_subvol_name;
+ }
+
+ if (!S_ISDIR(dir->i_mode)) {
+ err = -ENOTDIR;
+ goto free_subvol_name;
+ }
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
- goto out_drop_write;
- dentry = lookup_one_len(vol_args->name, parent, namelen);
+ goto free_subvol_name;
+ dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
@@ -2914,13 +3401,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (root == dest)
goto out_dput;
- err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+ err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
- err = btrfs_may_delete(dir, dentry, 1);
+ err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
if (err)
goto out_dput;
@@ -2929,21 +3416,25 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
- inode_unlock(inode);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ btrfs_inode_unlock(inode, 0);
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
out_unlock_dir:
- inode_unlock(dir);
+ btrfs_inode_unlock(dir, 0);
+free_subvol_name:
+ kfree(subvol_name_ptr);
+free_parent:
+ if (destroy_parent)
+ dput(parent);
out_drop_write:
mnt_drop_write_file(file);
out:
+ kfree(vol_args2);
kfree(vol_args);
return err;
}
@@ -2952,7 +3443,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_defrag_range_args *range;
+ struct btrfs_ioctl_defrag_range_args range = {0};
int ret;
ret = mnt_want_write_file(file);
@@ -2979,38 +3470,29 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
* running and allows defrag on files open in read-only mode.
*/
if (!capable(CAP_SYS_ADMIN) &&
- inode_permission(inode, MAY_WRITE)) {
+ inode_permission(&init_user_ns, inode, MAY_WRITE)) {
ret = -EPERM;
goto out;
}
- range = kzalloc(sizeof(*range), GFP_KERNEL);
- if (!range) {
- ret = -ENOMEM;
- goto out;
- }
-
if (argp) {
- if (copy_from_user(range, argp,
- sizeof(*range))) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
- kfree(range);
goto out;
}
/* compression requires us to start the IO */
- if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
- range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
- range->extent_thresh = (u32)-1;
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range.extent_thresh = (u32)-1;
}
} else {
/* the rest are all set to zero by kzalloc */
- range->len = (u64)-1;
+ range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
- range, BTRFS_OLDEST_GENERATION, 0);
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+ &range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
- kfree(range);
break;
default:
ret = -EINVAL;
@@ -3023,13 +3505,30 @@ out:
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
struct btrfs_ioctl_vol_args *vol_args;
+ bool restore_op = false;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
- return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
+ if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+
+ /*
+ * We can do the device add because we have a paused balanced,
+ * change the exclusive op type and remember we should bring
+ * back the paused balance
+ */
+ fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
+ btrfs_exclop_start_unlock(fs_info);
+ restore_op = true;
+ }
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
@@ -3045,48 +3544,60 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ if (restore_op)
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ else
+ btrfs_exclop_finish(fs_info);
return ret;
}
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
+ struct block_device *bdev = NULL;
+ fmode_t mode;
int ret;
+ bool cancel = false;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto err_drop;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
- /* Check for compatibility reject unknown flags */
- if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
+ if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- goto out;
- }
-
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
} else {
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
+
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret)
+ goto err_drop;
+
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
+
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3096,49 +3607,62 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
+ if (bdev)
+ blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
+ struct block_device *bdev = NULL;
+ fmode_t mode;
int ret;
+ bool cancel = false;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- goto out_drop_write;
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
}
- vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
+ ret = mnt_want_write_file(file);
+ if (ret)
goto out;
- }
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret == 0) {
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
+ if (!ret)
+ btrfs_info(fs_info, "disk deleted %s", vol_args->name);
+ btrfs_exclop_finish(fs_info);
+ }
- if (!ret)
- btrfs_info(fs_info, "disk deleted %s", vol_args->name);
- kfree(vol_args);
-out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-out_drop_write:
mnt_drop_write_file(file);
-
+ if (bdev)
+ blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
@@ -3148,11 +3672,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ u64 flags_in;
int ret = 0;
- fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
- if (!fi_args)
- return -ENOMEM;
+ fi_args = memdup_user(arg, sizeof(*fi_args));
+ if (IS_ERR(fi_args))
+ return PTR_ERR(fi_args);
+
+ flags_in = fi_args->flags;
+ memset(fi_args, 0, sizeof(*fi_args));
rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
@@ -3168,6 +3696,23 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
+ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
+ fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
+ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
+ fi_args->generation = fs_info->generation;
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
+ }
+
+ if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
+ memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
+ sizeof(fi_args->metadata_uuid));
+ fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
+ }
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -3178,22 +3723,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL, true);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
@@ -3220,733 +3764,6 @@ out:
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- }
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
-{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- int ret;
-
- /*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
- */
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
-
- return ret;
-}
-
-#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff)
-{
- int ret;
- u64 i, tail_len, chunk_count;
- struct btrfs_root *root_dst = BTRFS_I(dst)->root;
-
- spin_lock(&root_dst->root_item_lock);
- if (root_dst->send_in_progress) {
- btrfs_warn_rl(root_dst->fs_info,
-"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
- root_dst->send_in_progress);
- spin_unlock(&root_dst->root_item_lock);
- return -EAGAIN;
- }
- root_dst->dedupe_in_progress++;
- spin_unlock(&root_dst->root_item_lock);
-
- tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
- chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
-
- for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
- if (ret)
- goto out;
-
- loff += BTRFS_MAX_DEDUPE_LEN;
- dst_loff += BTRFS_MAX_DEDUPE_LEN;
- }
-
- if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst,
- dst_loff);
-out:
- spin_lock(&root_dst->root_item_lock);
- root_dst->dedupe_in_progress--;
- spin_unlock(&root_dst->root_item_lock);
-
- return ret;
-}
-
-static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
- struct inode *inode,
- u64 endoff,
- const u64 destoff,
- const u64 olen,
- int no_time_update)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
-
- inode_inc_iversion(inode);
- if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_time(inode);
- /*
- * We round up to the block size at eof when determining which
- * extents to clone above, but shouldn't round up the file size.
- */
- if (endoff > destoff + olen)
- endoff = destoff + olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(BTRFS_I(inode), endoff);
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
-}
-
-/*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need turn the inline extent into a non-inline one (even
- * with NO_HOLES enabled). If we find our destination inode only has one inline
- * extent, just overwrite it with the source inline extent if its size is less
- * than the source extent's size, or we could copy the source inline extent's
- * data into the destination inode's inline extent if the later is greater then
- * the former.
- */
-static int clone_copy_inline_extent(struct inode *dst,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_key *new_key,
- const u64 drop_start,
- const u64 datal,
- const u64 skip,
- const u64 size,
- char *inline_data)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
- struct btrfs_root *root = BTRFS_I(dst)->root;
- const u64 aligned_end = ALIGN(new_key->offset + datal,
- fs_info->sectorsize);
- int ret;
- struct btrfs_key key;
-
- if (new_key->offset > 0)
- return -EOPNOTSUPP;
-
- key.objectid = btrfs_ino(BTRFS_I(dst));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- goto copy_inline_extent;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
- key.type == BTRFS_EXTENT_DATA_KEY) {
- ASSERT(key.offset > 0);
- return -EOPNOTSUPP;
- }
- } else if (i_size_read(dst) <= datal) {
- struct btrfs_file_extent_item *ei;
- u64 ext_len;
-
- /*
- * If the file size is <= datal, make sure there are no other
- * extents following (can happen do to an fallocate call with
- * the flag FALLOC_FL_KEEP_SIZE).
- */
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- /*
- * If it's an inline extent, it can not have other extents
- * following it.
- */
- if (btrfs_file_extent_type(path->nodes[0], ei) ==
- BTRFS_FILE_EXTENT_INLINE)
- goto copy_inline_extent;
-
- ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- if (ext_len > aligned_end)
- return -EOPNOTSUPP;
-
- ret = btrfs_next_item(root, path);
- if (ret < 0) {
- return ret;
- } else if (ret == 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
- key.type == BTRFS_EXTENT_DATA_KEY)
- return -EOPNOTSUPP;
- }
- }
-
-copy_inline_extent:
- /*
- * We have no extent items, or we have an extent at offset 0 which may
- * or may not be inlined. All these cases are dealt the same way.
- */
- if (i_size_read(dst) > datal) {
- /*
- * If the destination inode has an inline extent...
- * This would require copying the data from the source inline
- * extent into the beginning of the destination's inline extent.
- * But this is really complex, both extents can be compressed
- * or just one of them, which would require decompressing and
- * re-compressing data (which could increase the new compressed
- * size, not allowing the compressed data to fit anymore in an
- * inline extent).
- * So just don't support this case for now (it should be rare,
- * we are not really saving space when cloning inline extents).
- */
- return -EOPNOTSUPP;
- }
-
- btrfs_release_path(path);
- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
- if (ret)
- return ret;
- ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
- return ret;
-
- if (skip) {
- const u32 start = btrfs_file_extent_calc_inline_size(0);
-
- memmove(inline_data + start, inline_data + start + skip, datal);
- }
-
- write_extent_buffer(path->nodes[0], inline_data,
- btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]),
- size);
- inode_add_bytes(dst, datal);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
-
- return 0;
-}
-
-/**
- * btrfs_clone() - clone a range from inode file to another
- *
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
- */
-static int btrfs_clone(struct inode *src, struct inode *inode,
- const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path = NULL;
- struct extent_buffer *leaf;
- struct btrfs_trans_handle *trans;
- char *buf = NULL;
- struct btrfs_key key;
- u32 nritems;
- int slot;
- int ret;
- const u64 len = olen_aligned;
- u64 last_dest_end = destoff;
-
- ret = -ENOMEM;
- buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
- if (!buf)
- return ret;
-
- path = btrfs_alloc_path();
- if (!path) {
- kvfree(buf);
- return ret;
- }
-
- path->reada = READA_FORWARD;
- /* clone data */
- key.objectid = btrfs_ino(BTRFS_I(src));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = off;
-
- while (1) {
- u64 next_key_min_offset = key.offset + 1;
- struct btrfs_file_extent_item *extent;
- int type;
- u32 size;
- struct btrfs_key new_key;
- u64 disko = 0, diskl = 0;
- u64 datao = 0, datal = 0;
- u8 comp;
- u64 drop_start;
-
- /*
- * note the key will change type as we walk through the
- * tree.
- */
- path->leave_spinning = 1;
- ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
- 0, 0);
- if (ret < 0)
- goto out;
- /*
- * First search, if no extent item that starts at offset off was
- * found but the previous item is an extent item, it's possible
- * it might overlap our target range, therefore process it.
- */
- if (key.offset == off && ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0] - 1);
- if (key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
- nritems = btrfs_header_nritems(path->nodes[0]);
-process_slot:
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- nritems = btrfs_header_nritems(path->nodes[0]);
- }
- leaf = path->nodes[0];
- slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.type > BTRFS_EXTENT_DATA_KEY ||
- key.objectid != btrfs_ino(BTRFS_I(src)))
- break;
-
- ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- comp = btrfs_file_extent_compression(leaf, extent);
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- disko = btrfs_file_extent_disk_bytenr(leaf, extent);
- diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
- datao = btrfs_file_extent_offset(leaf, extent);
- datal = btrfs_file_extent_num_bytes(leaf, extent);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- /* Take upper bound, may be compressed */
- datal = btrfs_file_extent_ram_bytes(leaf, extent);
- }
-
- /*
- * The first search might have left us at an extent item that
- * ends before our target range's start, can happen if we have
- * holes and NO_HOLES feature enabled.
- */
- if (key.offset + datal <= off) {
- path->slots[0]++;
- goto process_slot;
- } else if (key.offset >= off + len) {
- break;
- }
- next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
- read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
- size);
-
- btrfs_release_path(path);
- path->leave_spinning = 0;
-
- memcpy(&new_key, &key, sizeof(new_key));
- new_key.objectid = btrfs_ino(BTRFS_I(inode));
- if (off <= key.offset)
- new_key.offset = key.offset + destoff - off;
- else
- new_key.offset = destoff;
-
- /*
- * Deal with a hole that doesn't have an extent item that
- * represents it (NO_HOLES feature enabled).
- * This hole is either in the middle of the cloning range or at
- * the beginning (fully overlaps it or partially overlaps it).
- */
- if (new_key.offset != last_dest_end)
- drop_start = last_dest_end;
- else
- drop_start = new_key.offset;
-
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
-
- /*
- * a | --- range to clone ---| b
- * | ------------- extent ------------- |
- */
-
- /* Subtract range b */
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
- /* Subtract range a */
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- clone_info.disk_offset = disko;
- clone_info.disk_len = diskl;
- clone_info.data_offset = datao;
- clone_info.data_len = datal;
- clone_info.file_offset = new_key.offset;
- clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path,
- drop_start,
- new_key.offset + datal - 1,
- &clone_info, &trans);
- if (ret)
- goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 skip = 0;
- u64 trim = 0;
-
- if (off > key.offset) {
- skip = off - key.offset;
- new_key.offset += skip;
- }
-
- if (key.offset + datal > off + len)
- trim = key.offset + datal - (off + len);
-
- if (comp && (skip || trim)) {
- ret = -EINVAL;
- goto out;
- }
- size -= skip + trim;
- datal -= skip + trim;
-
- /*
- * If our extent is inline, we know we will drop or
- * adjust at most 1 extent item in the destination root.
- *
- * 1 - adjusting old extent (we may have to split it)
- * 1 - add new extent
- * 1 - inode update
- */
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
- ret = clone_copy_inline_extent(inode, trans, path,
- &new_key, drop_start,
- datal, skip, size, buf);
- if (ret) {
- if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- }
-
- btrfs_release_path(path);
-
- last_dest_end = ALIGN(new_key.offset + datal,
- fs_info->sectorsize);
- ret = clone_finish_inode_update(trans, inode, last_dest_end,
- destoff, olen, no_time_update);
- if (ret)
- goto out;
- if (new_key.offset + datal >= destoff + len)
- break;
-
- btrfs_release_path(path);
- key.offset = next_key_min_offset;
-
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
- }
- ret = 0;
-
- if (last_dest_end < destoff + len) {
- /*
- * We have an implicit hole that fully or partially overlaps our
- * cloning range at its end. This means that we either have the
- * NO_HOLES feature enabled or the implicit hole happened due to
- * mixing buffered and direct IO writes against this file.
- */
- btrfs_release_path(path);
- path->leave_spinning = 0;
-
- ret = btrfs_punch_hole_range(inode, path,
- last_dest_end, destoff + len - 1,
- NULL, &trans);
- if (ret)
- goto out;
-
- ret = clone_finish_inode_update(trans, inode, destoff + len,
- destoff, olen, no_time_update);
- }
-
-out:
- btrfs_free_path(path);
- kvfree(buf);
- return ret;
-}
-
-static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
- u64 off, u64 olen, u64 destoff)
-{
- struct inode *inode = file_inode(file);
- struct inode *src = file_inode(file_src);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret;
- u64 len = olen;
- u64 bs = fs_info->sb->s_blocksize;
-
- /*
- * TODO:
- * - split compressed inline extents. annoying: we need to
- * decompress into destination's address_space (the file offset
- * may change, so source mapping won't do), then recompress (or
- * otherwise reinsert) a subrange.
- *
- * - split destination inode's inline extents. The inline extents can
- * be either compressed or non-compressed.
- */
-
- /*
- * VFS's generic_remap_file_range_prep() protects us from cloning the
- * eof block into the middle of a file, which would result in corruption
- * if the file size is not blocksize aligned. So we don't need to check
- * for that case here.
- */
- if (off + len == src->i_size)
- len = ALIGN(src->i_size, bs) - off;
-
- if (destoff > inode->i_size) {
- const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
-
- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
- if (ret)
- return ret;
- /*
- * We may have truncated the last block if the inode's size is
- * not sector size aligned, so we need to wait for writeback to
- * complete before proceeding further, otherwise we can race
- * with cloning and attempt to increment a reference to an
- * extent that no longer exists (writeback completed right after
- * we found the previous extent covering eof and before we
- * attempted to increment its reference count).
- */
- ret = btrfs_wait_ordered_range(inode, wb_start,
- destoff - wb_start);
- if (ret)
- return ret;
- }
-
- /*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
- */
- btrfs_double_extent_lock(src, off, inode, destoff, len);
- ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
- /*
- * Truncate page cache pages so that future reads will see the cloned
- * data immediately and not the previous data.
- */
- truncate_inode_pages_range(&inode->i_data,
- round_down(destoff, PAGE_SIZE),
- round_up(destoff + len, PAGE_SIZE) - 1);
-
- return ret;
-}
-
-static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
- u64 wb_len;
- int ret;
-
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
-
- if (btrfs_root_readonly(root_out))
- return -EROFS;
-
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
- }
-
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
- return -EINVAL;
- }
-
- /*
- * Now that the inodes are locked, we need to start writeback ourselves
- * and can not rely on the writeback from the VFS's generic helper
- * generic_remap_file_range_prep() because:
- *
- * 1) For compression we must call filemap_fdatawrite_range() range
- * twice (btrfs_fdatawrite_range() does it for us), and the generic
- * helper only calls it once;
- *
- * 2) filemap_fdatawrite_range(), called by the generic helper only
- * waits for the writeback to complete, i.e. for IO to be done, and
- * not for the ordered extents to complete. We need to wait for them
- * to complete so that new file extent items are in the fs tree.
- */
- if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
- else
- wb_len = ALIGN(*len, bs);
-
- /*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
- * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
- *
- * Btrfs' back references do not have a block level granularity, they
- * work at the whole extent level.
- * NOCOW buffered write without data space reserved may not be able
- * to fall back to CoW due to lack of data space, thus could cause
- * data loss.
- *
- * Here we take a shortcut by flushing the whole inode, so that all
- * nocow write should reach disk as nocow before we increase the
- * reference of the extent. We could do better by only flushing NOCOW
- * data, but that needs extra accounting.
- *
- * Also we don't need to check ASYNC_EXTENT, as async extent will be
- * CoWed anyway, not affecting nocow part.
- */
- ret = filemap_flush(inode_in->i_mapping);
- if (ret < 0)
- return ret;
-
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
- wb_len);
- if (ret < 0)
- return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
- wb_len);
- if (ret < 0)
- return ret;
-
- return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
-}
-
-loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, loff_t len,
- unsigned int remap_flags)
-{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
- bool same_inode = dst_inode == src_inode;
- int ret;
-
- if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
- return -EINVAL;
-
- if (same_inode)
- inode_lock(src_inode);
- else
- lock_two_nondirectories(src_inode, dst_inode);
-
- ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
- &len, remap_flags);
- if (ret < 0 || len == 0)
- goto out_unlock;
-
- if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
- else
- ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
-
-out_unlock:
- if (same_inode)
- inode_unlock(src_inode);
- else
- unlock_two_nondirectories(src_inode, dst_inode);
-
- return ret < 0 ? ret : len;
-}
-
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
@@ -3955,8 +3772,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
- struct btrfs_key location;
+ struct btrfs_path *path = NULL;
struct btrfs_disk_key disk_key;
u64 objectid = 0;
u64 dir_id;
@@ -3977,53 +3793,50 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
if (!objectid)
objectid = BTRFS_FS_TREE_OBJECTID;
- location.objectid = objectid;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
-
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ new_root = btrfs_get_fs_root(fs_info, objectid, true);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
goto out;
}
if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
- goto out;
+ goto out_free;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out;
+ goto out_free;
}
- path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_free_path(path);
ret = PTR_ERR(trans);
- goto out;
+ goto out_free;
}
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
- btrfs_free_path(path);
+ btrfs_release_path(path);
btrfs_end_transaction(trans);
btrfs_err(fs_info,
"Umm, you don't have the default diritem, this isn't going to work");
ret = -ENOENT;
- goto out;
+ goto out_free;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_free_path(path);
+ btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans);
+out_free:
+ btrfs_put_root(new_root);
+ btrfs_free_path(path);
out:
mnt_drop_write_file(file);
return ret;
@@ -4074,15 +3887,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -4130,15 +3940,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -4192,7 +3999,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
{
struct btrfs_trans_handle *trans;
u64 transid;
- int ret;
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -4204,11 +4010,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
goto out;
}
transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ btrfs_commit_transaction_async(trans);
out:
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -4239,6 +4041,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
@@ -4338,6 +4145,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
p = memdup_user(arg, sizeof(*p));
if (IS_ERR(p))
return PTR_ERR(p);
@@ -4348,11 +4160,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -4434,26 +4246,6 @@ out:
return ret;
}
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
-{
- struct btrfs_data_container *inodes = ctx;
- const size_t c = 3 * sizeof(u64);
-
- if (inodes->bytes_left >= c) {
- inodes->bytes_left -= c;
- inodes->val[inodes->elem_cnt] = inum;
- inodes->val[inodes->elem_cnt + 1] = offset;
- inodes->val[inodes->elem_cnt + 2] = root;
- inodes->elem_cnt += 3;
- } else {
- inodes->bytes_missing += c - inodes->bytes_left;
- inodes->bytes_left = 0;
- inodes->elem_missed += 3;
- }
-
- return 0;
-}
-
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
void __user *arg, int version)
{
@@ -4503,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
}
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- build_ino_list, inodes, ignore_offset);
+ inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4546,13 +4338,79 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
+/**
+ * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
+ * required.
+ *
+ * @fs_info: the filesystem
+ * @excl_acquired: ptr to boolean value which is set to false in case balance
+ * is being resumed
+ *
+ * Return 0 on success in which case both fs_info::balance is acquired as well
+ * as exclusive ops are blocked. In case of failure return an error code.
+ */
+static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
+{
+ int ret;
+
+ /*
+ * Exclusive operation is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ while (1) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ *excl_acquired = true;
+ mutex_lock(&fs_info->balance_mutex);
+ return 0;
+ }
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* This is either (2) or (3) */
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (2) */
+ ret = -EINPROGRESS;
+ goto out_failure;
+
+ } else {
+ mutex_unlock(&fs_info->balance_mutex);
+ /*
+ * Lock released to allow other waiters to
+ * continue, we'll reexamine the status again.
+ */
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (3) */
+ *excl_acquired = false;
+ return 0;
+ }
+ }
+ } else {
+ /* This is (1) */
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_failure;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ }
+
+out_failure:
+ mutex_unlock(&fs_info->balance_mutex);
+ *excl_acquired = false;
+ return ret;
+}
+
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
- bool need_unlock; /* for mut. excl. ops lock */
+ bool need_unlock = true;
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -4562,133 +4420,80 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
-again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
- mutex_lock(&fs_info->balance_mutex);
- need_unlock = true;
- goto locked;
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ bargs = NULL;
+ goto out;
}
- /*
- * mut. excl. ops lock is locked. Three possibilities:
- * (1) some other op is running
- * (2) balance is running
- * (3) balance is paused -- special case (think resume)
- */
- mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl) {
- /* this is either (2) or (3) */
- if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
- /*
- * Lock released to allow other waiters to continue,
- * we'll reexamine the status again.
- */
- mutex_lock(&fs_info->balance_mutex);
-
- if (fs_info->balance_ctl &&
- !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- /* this is (3) */
- need_unlock = false;
- goto locked;
- }
-
- mutex_unlock(&fs_info->balance_mutex);
- goto again;
- } else {
- /* this is (2) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = -EINPROGRESS;
- goto out;
- }
- } else {
- /* this is (1) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ ret = btrfs_try_lock_balance(fs_info, &need_unlock);
+ if (ret)
goto out;
- }
-locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+ lockdep_assert_held(&fs_info->balance_mutex);
- if (arg) {
- bargs = memdup_user(arg, sizeof(*bargs));
- if (IS_ERR(bargs)) {
- ret = PTR_ERR(bargs);
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
goto out_unlock;
}
- if (bargs->flags & BTRFS_BALANCE_RESUME) {
- if (!fs_info->balance_ctl) {
- ret = -ENOTCONN;
- goto out_bargs;
- }
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
- bctl = fs_info->balance_ctl;
- spin_lock(&fs_info->balance_lock);
- bctl->flags |= BTRFS_BALANCE_RESUME;
- spin_unlock(&fs_info->balance_lock);
+ goto do_balance;
+ }
- goto do_balance;
- }
- } else {
- bargs = NULL;
+ if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_unlock;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
- goto out_bargs;
+ goto out_unlock;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
- goto out_bargs;
- }
-
- if (arg) {
- memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
- memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
- memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-
- bctl->flags = bargs->flags;
- } else {
- /* balance everything - no filters */
- bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ goto out_unlock;
}
- if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
- ret = -EINVAL;
- goto out_bctl;
- }
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+ bctl->flags = bargs->flags;
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
*/
need_unlock = false;
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if ((ret == 0 || ret == -ECANCELED) && arg) {
+ if (ret == 0 || ret == -ECANCELED) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
-out_bctl:
kfree(bctl);
-out_bargs:
- kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
+ kfree(bargs);
return ret;
}
@@ -4966,26 +4771,20 @@ drop_write:
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_quota_rescan_args *qsa;
- int ret = 0;
+ struct btrfs_ioctl_quota_rescan_args qsa = {0};
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
- if (!qsa)
- return -ENOMEM;
-
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- qsa->flags = 1;
- qsa->progress = fs_info->qgroup_rescan_progress.objectid;
+ qsa.flags = 1;
+ qsa.progress = fs_info->qgroup_rescan_progress.objectid;
}
- if (copy_to_user(arg, qsa, sizeof(*qsa)))
- ret = -EFAULT;
+ if (copy_to_user(arg, &qsa, sizeof(qsa)))
+ return -EFAULT;
- kfree(qsa);
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
@@ -4998,6 +4797,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct user_namespace *mnt_userns,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
@@ -5009,7 +4809,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
int ret = 0;
int received_uuid_changed;
- if (!inode_owner_or_capable(inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
ret = mnt_want_write_file(file);
@@ -5114,7 +4914,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
args64->rtime.nsec = args32->rtime.nsec;
args64->flags = args32->flags;
- ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
if (ret)
goto out;
@@ -5148,7 +4948,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = _btrfs_ioctl_set_received_subvol(file, sa);
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
if (ret)
goto out;
@@ -5400,7 +5200,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -5430,11 +5230,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(file, arg);
+ ret = btrfs_ioctl_send(inode, arg);
kfree(arg);
return ret;
}
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+ bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+ flags);
+ size_t copy_end;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+ flags);
+ if (copy_from_user(&args32, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+ if (args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ if (ret >= 0) {
+ fsnotify_access(file);
+ if (copy_to_user(argp + copy_end,
+ (char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel))
+ ret = -EFAULT;
+ }
+
+out_iov:
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, argp, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+ args.len = args32.len;
+ args.unencoded_len = args32.unencoded_len;
+ args.unencoded_offset = args32.unencoded_offset;
+ args.compression = args32.compression;
+ args.encryption = args32.encryption;
+ memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ ret = -EINVAL;
+ if (args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
+ goto out_acct;
+ if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (args.unencoded_offset > args.unencoded_len)
+ goto out_acct;
+ if (args.len > args.unencoded_len - args.unencoded_offset)
+ goto out_acct;
+
+ ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ file_start_write(file);
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_end_write;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, args.len);
+ if (ret < 0)
+ goto out_end_write;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0);
+ if (ret)
+ goto out_end_write;
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_do_write_iter(&kiocb, &iter, &args);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+out_end_write:
+ file_end_write(file);
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -5444,12 +5427,8 @@ long btrfs_ioctl(struct file *file, unsigned int
void __user *argp = (void __user *)arg;
switch (cmd) {
- case FS_IOC_GETFLAGS:
- return btrfs_ioctl_getflags(file, argp);
- case FS_IOC_SETFLAGS:
- return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
- return btrfs_ioctl_getversion(file, argp);
+ return btrfs_ioctl_getversion(inode, argp);
case FS_IOC_GETFSLABEL:
return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
@@ -5465,9 +5444,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SUBVOL_CREATE_V2:
return btrfs_ioctl_snap_create_v2(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
- return btrfs_ioctl_snap_destroy(file, argp);
+ return btrfs_ioctl_snap_destroy(file, argp, false);
+ case BTRFS_IOC_SNAP_DESTROY_V2:
+ return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(file, argp);
+ return btrfs_ioctl_subvol_getflags(inode, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5488,14 +5469,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_fs_info(fs_info, argp);
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
- case BTRFS_IOC_BALANCE:
- return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(file, argp);
+ return btrfs_ioctl_tree_search(inode, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(file, argp);
+ return btrfs_ioctl_tree_search_v2(inode, argp);
case BTRFS_IOC_INO_LOOKUP:
- return btrfs_ioctl_ino_lookup(file, argp);
+ return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
@@ -5507,7 +5486,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
@@ -5542,10 +5521,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(file, argp, false);
+ return _btrfs_ioctl_send(inode, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(file, argp, true);
+ return _btrfs_ioctl_send(inode, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
@@ -5571,16 +5550,26 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
- case FS_IOC_FSGETXATTR:
- return btrfs_ioctl_fsgetxattr(file, argp);
- case FS_IOC_FSSETXATTR:
- return btrfs_ioctl_fssetxattr(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
- return btrfs_ioctl_get_subvol_info(file, argp);
+ return btrfs_ioctl_get_subvol_info(inode, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
- return btrfs_ioctl_get_subvol_rootref(file, argp);
+ return btrfs_ioctl_get_subvol_rootref(root, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
+ case FS_IOC_ENABLE_VERITY:
+ return fsverity_ioctl_enable(file, (const void __user *)argp);
+ case FS_IOC_MEASURE_VERITY:
+ return fsverity_ioctl_measure(file, argp);
+ case BTRFS_IOC_ENCODED_READ:
+ return btrfs_ioctl_encoded_read(file, argp, false);
+ case BTRFS_IOC_ENCODED_WRITE:
+ return btrfs_ioctl_encoded_write(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+ return btrfs_ioctl_encoded_read(file, argp, true);
+ case BTRFS_IOC_ENCODED_WRITE_32:
+ return btrfs_ioctl_encoded_write(file, argp, true);
+#endif
}
return -ENOTTY;
@@ -5594,12 +5583,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
* handling is necessary.
*/
switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 571c4826c428..0eab3cb274a1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,488 +14,203 @@
#include "locking.h"
/*
- * Extent buffer locking
- * =====================
- *
- * The locks use a custom scheme that allows to do more operations than are
- * available fromt current locking primitives. The building blocks are still
- * rwlock and wait queues.
- *
- * Required semantics:
- *
- * - reader/writer exclusion
- * - writer/writer exclusion
- * - reader/reader sharing
- * - spinning lock semantics
- * - blocking lock semantics
- * - try-lock semantics for readers and writers
- * - one level nesting, allowing read lock to be taken by the same thread that
- * already has write lock
- *
- * The extent buffer locks (also called tree locks) manage access to eb data
- * related to the storage in the b-tree (keys, items, but not the individual
- * members of eb).
- * We want concurrency of many readers and safe updates. The underlying locking
- * is done by read-write spinlock and the blocking part is implemented using
- * counters and wait queues.
- *
- * spinning semantics - the low-level rwlock is held so all other threads that
- * want to take it are spinning on it.
- *
- * blocking semantics - the low-level rwlock is not held but the counter
- * denotes how many times the blocking lock was held;
- * sleeping is possible
- *
- * Write lock always allows only one thread to access the data.
- *
- *
- * Debugging
- * ---------
- *
- * There are additional state counters that are asserted in various contexts,
- * removed from non-debug build to reduce extent_buffer size and for
- * performance reasons.
- *
- *
- * Lock nesting
- * ------------
- *
- * A write operation on a tree might indirectly start a look up on the same
- * tree. This can happen when btrfs_cow_block locks the tree and needs to
- * lookup free extents.
- *
- * btrfs_cow_block
- * ..
- * alloc_tree_block_no_bg_flush
- * btrfs_alloc_tree_block
- * btrfs_reserve_extent
- * ..
- * load_free_space_cache
- * ..
- * btrfs_lookup_file_extent
- * btrfs_search_slot
- *
- *
- * Locking pattern - spinning
- * --------------------------
- *
- * The simple locking scenario, the +--+ denotes the spinning section.
- *
- * +- btrfs_tree_lock
- * | - extent_buffer::rwlock is held
- * | - no heavy operations should happen, eg. IO, memory allocations, large
- * | structure traversals
- * +- btrfs_tree_unock
-*
-*
- * Locking pattern - blocking
- * --------------------------
- *
- * The blocking write uses the following scheme. The +--+ denotes the spinning
- * section.
- *
- * +- btrfs_tree_lock
- * |
- * +- btrfs_set_lock_blocking_write
- *
- * - allowed: IO, memory allocations, etc.
- *
- * -- btrfs_tree_unlock - note, no explicit unblocking necessary
- *
- *
- * Blocking read is similar.
- *
- * +- btrfs_tree_read_lock
- * |
- * +- btrfs_set_lock_blocking_read
- *
- * - heavy operations allowed
- *
- * +- btrfs_tree_read_unlock_blocking
- * |
- * +- btrfs_tree_read_unlock
- *
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
+ *
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
+ *
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
*/
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if BTRFS_MAX_LEVEL != 8
+#error
+#endif
-#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers);
- eb->spinning_writers++;
-}
-
-static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers != 1);
- eb->spinning_writers--;
-}
-
-static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers);
-}
-
-static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
-{
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ /* Longest entry: btrfs-free-space-00 */
+ char names[BTRFS_MAX_LEVEL][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = 0, DEFINE_NAME("tree") },
+};
+
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
- atomic_inc(&eb->read_locks);
-}
+ struct btrfs_lockdep_keyset *ks;
-static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
-{
- atomic_dec(&eb->read_locks);
-}
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
-static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
-{
- BUG_ON(!atomic_read(&eb->read_locks));
-}
+ /* Find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
-static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
-{
- eb->write_locks++;
+ lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}
-static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
- eb->write_locks--;
+ if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ eb, btrfs_header_level(eb));
}
-#else
-static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
#endif
/*
- * Mark already held read lock as blocking. Can be nested in write lock by the
- * same thread.
+ * Extent buffer locking
+ * =====================
*
- * Use when there are potentially long operations ahead so other thread waiting
- * on the lock will not actively spin but sleep instead.
+ * We use a rw_semaphore for tree locking, and the semantics are exactly the
+ * same:
*
- * The rwlock is released and blocking reader counter is increased.
- */
-void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
-{
- trace_btrfs_set_lock_blocking_read(eb);
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_nested && current->pid == eb->lock_owner)
- return;
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- btrfs_assert_spinning_readers_put(eb);
- read_unlock(&eb->lock);
-}
-
-/*
- * Mark already held write lock as blocking.
- *
- * Use when there are potentially long operations ahead so other threads
- * waiting on the lock will not actively spin but sleep instead.
+ * - reader/writer exclusion
+ * - writer/writer exclusion
+ * - reader/reader sharing
+ * - try-lock semantics for readers and writers
*
- * The rwlock is released and blocking writers is set.
+ * The rwsem implementation does opportunistic spinning which reduces number of
+ * times the locking task needs to sleep.
*/
-void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
-{
- trace_btrfs_set_lock_blocking_write(eb);
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_nested && current->pid == eb->lock_owner)
- return;
- if (eb->blocking_writers == 0) {
- btrfs_assert_spinning_writers_put(eb);
- btrfs_assert_tree_locked(eb);
- WRITE_ONCE(eb->blocking_writers, 1);
- write_unlock(&eb->lock);
- }
-}
/*
- * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
- * Can be nested in write lock by the same thread.
+ * __btrfs_tree_read_lock - lock extent buffer for read
+ * @eb: the eb to be locked
+ * @nest: the nesting level to be used for lockdep
*
- * Use when the locked section does only lightweight actions and busy waiting
- * would be cheaper than making other threads do the wait/wake loop.
- *
- * The rwlock is held upon exit.
+ * This takes the read lock on the extent buffer, using the specified nesting
+ * level for lockdep purposes.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
if (trace_btrfs_tree_read_lock_enabled())
start_ns = ktime_get_ns();
-again:
- read_lock(&eb->lock);
- BUG_ON(eb->blocking_writers == 0 &&
- current->pid == eb->lock_owner);
- if (eb->blocking_writers) {
- if (current->pid == eb->lock_owner) {
- /*
- * This extent is already write-locked by our thread.
- * We allow an additional read lock to be added because
- * it's for the same thread. btrfs_find_all_roots()
- * depends on this as it may be called on a partly
- * (write-)locked tree.
- */
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
- read_unlock(&eb->lock);
- trace_btrfs_tree_read_lock(eb, start_ns);
- return;
- }
- read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- READ_ONCE(eb->blocking_writers) == 0);
- goto again;
- }
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
+
+ down_read_nested(&eb->lock, nest);
trace_btrfs_tree_read_lock(eb, start_ns);
}
-/*
- * Lock extent buffer for read, optimistically expecting that there are no
- * contending blocking writers. If there are, don't wait.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
+void btrfs_tree_read_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers))
- return 0;
-
- read_lock(&eb->lock);
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers)) {
- read_unlock(&eb->lock);
- return 0;
- }
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
- trace_btrfs_tree_read_lock_atomic(eb);
- return 1;
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}
/*
- * Try-lock for read. Don't block or wait for contending writers.
+ * Try-lock for read.
*
- * Retrun 1 if the rwlock has been taken, 0 otherwise
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers))
- return 0;
-
- if (!read_trylock(&eb->lock))
- return 0;
-
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers)) {
- read_unlock(&eb->lock);
- return 0;
+ if (down_read_trylock(&eb->lock)) {
+ trace_btrfs_try_tree_read_lock(eb);
+ return 1;
}
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
- trace_btrfs_try_tree_read_lock(eb);
- return 1;
+ return 0;
}
/*
- * Try-lock for write. May block until the lock is uncontended, but does not
- * wait until it is free.
+ * Try-lock for write.
*
- * Retrun 1 if the rwlock has been taken, 0 otherwise
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
- return 0;
-
- write_lock(&eb->lock);
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
- write_unlock(&eb->lock);
- return 0;
+ if (down_write_trylock(&eb->lock)) {
+ eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_write_lock(eb);
+ return 1;
}
- btrfs_assert_tree_write_locks_get(eb);
- btrfs_assert_spinning_writers_get(eb);
- eb->lock_owner = current->pid;
- trace_btrfs_try_tree_write_lock(eb);
- return 1;
+ return 0;
}
/*
- * Release read lock. Must be used only if the lock is in spinning mode. If
- * the read lock is nested, must pair with read lock before the write unlock.
- *
- * The rwlock is not held upon exit.
+ * Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- /*
- * if we're nested, we have the write lock. No new locking
- * is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
- * field only matters to the lock owner.
- */
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
- return;
- }
- btrfs_assert_tree_read_locked(eb);
- btrfs_assert_spinning_readers_put(eb);
- btrfs_assert_tree_read_locks_put(eb);
- read_unlock(&eb->lock);
+ up_read(&eb->lock);
}
/*
- * Release read lock, previously set to blocking by a pairing call to
- * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
- * thread.
+ * __btrfs_tree_lock - lock eb for write
+ * @eb: the eb to lock
+ * @nest: the nesting to use for the lock
*
- * State of rwlock is unchanged, last reader wakes waiting threads.
+ * Returns with the eb->lock write locked.
*/
-void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
-{
- trace_btrfs_tree_read_unlock_blocking(eb);
- /*
- * if we're nested, we have the write lock. No new locking
- * is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
- * field only matters to the lock owner.
- */
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
- return;
- }
- btrfs_assert_tree_read_locked(eb);
- WARN_ON(atomic_read(&eb->blocking_readers) == 0);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- btrfs_assert_tree_read_locks_put(eb);
-}
-
-/*
- * Lock for write. Wait for all blocking and spinning readers and writers. This
- * starts context where reader lock could be nested by the same thread.
- *
- * The rwlock is held for write upon exit.
- */
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
+ __acquires(&eb->lock)
{
u64 start_ns = 0;
if (trace_btrfs_tree_lock_enabled())
start_ns = ktime_get_ns();
- WARN_ON(eb->lock_owner == current->pid);
-again:
- wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
- wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
- write_lock(&eb->lock);
- /* Refetch value after lock */
- if (atomic_read(&eb->blocking_readers) ||
- READ_ONCE(eb->blocking_writers)) {
- write_unlock(&eb->lock);
- goto again;
- }
- btrfs_assert_spinning_writers_get(eb);
- btrfs_assert_tree_write_locks_get(eb);
+ down_write_nested(&eb->lock, nest);
eb->lock_owner = current->pid;
trace_btrfs_tree_lock(eb, start_ns);
}
-/*
- * Release the write lock, either blocking or spinning (ie. there's no need
- * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
- * This also ends the context for nesting, the read lock must have been
- * released already.
- *
- * Tasks blocked and waiting are woken, rwlock is not held upon exit.
- */
-void btrfs_tree_unlock(struct extent_buffer *eb)
+void btrfs_tree_lock(struct extent_buffer *eb)
{
- /*
- * This is read both locked and unlocked but always by the same thread
- * that already owns the lock so we don't need to use READ_ONCE
- */
- int blockers = eb->blocking_writers;
-
- BUG_ON(blockers > 1);
-
- btrfs_assert_tree_locked(eb);
- trace_btrfs_tree_unlock(eb);
- eb->lock_owner = 0;
- btrfs_assert_tree_write_locks_put(eb);
-
- if (blockers) {
- btrfs_assert_no_spinning_writers(eb);
- /* Unlocked write */
- WRITE_ONCE(eb->blocking_writers, 0);
- /*
- * We need to order modifying blocking_writers above with
- * actually waking up the sleepers to ensure they see the
- * updated value of blocking_writers
- */
- cond_wake_up(&eb->write_lock_wq);
- } else {
- btrfs_assert_spinning_writers_put(eb);
- write_unlock(&eb->lock);
- }
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}
/*
- * Set all locked nodes in the path to blocking locks. This should be done
- * before scheduling
+ * Release the write lock.
*/
-void btrfs_set_path_blocking(struct btrfs_path *p)
+void btrfs_tree_unlock(struct extent_buffer *eb)
{
- int i;
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
+ trace_btrfs_tree_unlock(eb);
+ eb->lock_owner = 0;
+ up_write(&eb->lock);
}
/*
@@ -523,3 +238,167 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
path->locks[i] = 0;
}
}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with write lock held
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
+ btrfs_tree_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with read lock held
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree in
+ * nowait mode until we end up with a lock on the root node or returning to
+ * avoid blocking.
+ *
+ * Return: root extent buffer with read lock held or -EAGAIN.
+ */
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ if (!btrfs_try_tree_read_lock(eb)) {
+ free_extent_buffer(eb);
+ return ERR_PTR(-EAGAIN);
+ }
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * DREW locks
+ * ==========
+ *
+ * DREW stands for double-reader-writer-exclusion lock. It's used in situation
+ * where you want to provide A-B exclusion but not AA or BB.
+ *
+ * Currently implementation gives more priority to reader. If a reader and a
+ * writer both race to acquire their respective sides of the lock the writer
+ * would yield its lock as soon as it detects a concurrent reader. Additionally
+ * if there are pending readers no new writers would be allowed to come in and
+ * acquire the lock.
+ */
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+{
+ int ret;
+
+ ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ atomic_set(&lock->readers, 0);
+ init_waitqueue_head(&lock->pending_readers);
+ init_waitqueue_head(&lock->pending_writers);
+
+ return 0;
+}
+
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_destroy(&lock->writers);
+}
+
+/* Return true if acquisition is successful, false otherwise */
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
+{
+ if (atomic_read(&lock->readers))
+ return false;
+
+ percpu_counter_inc(&lock->writers);
+
+ /* Ensure writers count is updated before we check for pending readers */
+ smp_mb();
+ if (atomic_read(&lock->readers)) {
+ btrfs_drew_write_unlock(lock);
+ return false;
+ }
+
+ return true;
+}
+
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
+{
+ while (true) {
+ if (btrfs_drew_try_write_lock(lock))
+ return;
+ wait_event(lock->pending_writers, !atomic_read(&lock->readers));
+ }
+}
+
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_dec(&lock->writers);
+ cond_wake_up(&lock->pending_readers);
+}
+
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
+{
+ atomic_inc(&lock->readers);
+
+ /*
+ * Ensure the pending reader count is perceieved BEFORE this reader
+ * goes to sleep in case of active writers. This guarantees new writers
+ * won't be allowed and that the current reader will be woken up when
+ * the last active writer finishes its jobs.
+ */
+ smp_mb__after_atomic();
+
+ wait_event(lock->pending_readers,
+ percpu_counter_sum(&lock->writers) == 0);
+}
+
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
+{
+ /*
+ * atomic_dec_and_test implies a full barrier, so woken up writers
+ * are guaranteed to see the decrement
+ */
+ if (atomic_dec_and_test(&lock->readers))
+ wake_up(&lock->pending_writers);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 21a285883e89..490c7a79e995 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -6,46 +6,144 @@
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
#include "extent_io.h"
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
-#define BTRFS_WRITE_LOCK_BLOCKING 3
-#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
+struct btrfs_path;
+
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
-void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
-int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- BUG_ON(!eb->write_locks);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
}
#else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif
-void btrfs_set_path_blocking(struct btrfs_path *p);
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
- if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+ if (rw == BTRFS_WRITE_LOCK)
btrfs_tree_unlock(eb);
- else if (rw == BTRFS_READ_LOCK_BLOCKING)
- btrfs_tree_read_unlock_blocking(eb);
else if (rw == BTRFS_READ_LOCK)
btrfs_tree_read_unlock(eb);
else
BUG();
}
+struct btrfs_drew_lock {
+ atomic_t readers;
+ struct percpu_counter writers;
+ wait_queue_head_t pending_writers;
+ wait_queue_head_t pending_readers;
+};
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level);
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb);
+#else
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
+{
+}
+static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+}
+#endif
+
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index aa9cd11f4b78..89bc5f825e0a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -14,6 +14,7 @@
#include <linux/lzo.h>
#include <linux/refcount.h>
#include "compression.h"
+#include "ctree.h"
#define LZO_LEN 4
@@ -31,19 +32,19 @@
* payload.
* One regular LZO compressed extent can have one or more segments.
* For inlined LZO compressed extent, only one segment is allowed.
- * One segment represents at most one page of uncompressed data.
+ * One segment represents at most one sector of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
- * Segment header never crosses page boundary, thus it's possible to
- * have at most 3 padding zeros at the end of the page.
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
*
* 2.2 Data Payload
- * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- * which is 4419 for a 4KiB page.
+ * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ * which is 4419 for a 4KiB sectorsize.
*
- * Example:
+ * Example with 4K sectorsize:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
@@ -54,6 +55,9 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
+#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -82,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
+ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -111,312 +115,313 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ unsigned long max_nr_page,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
+ /*
+ * We never allow a segment header crossing sector boundary, previous
+ * run should ensure we have enough space left inside the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap_local_page(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap_local(kaddr);
+
+ if ((*cur_out / PAGE_SIZE) >= max_nr_page)
+ return -E2BIG;
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap_local_page(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining size is not enough, pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap_local(kaddr);
+ return 0;
+}
+
int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
+ const unsigned long max_nr_page = *out_pages;
int ret = 0;
- char *data_in;
- char *cpage_out;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
- unsigned long bytes_left;
- unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- size_t in_len;
- size_t out_len;
- char *buf;
- unsigned long tot_in = 0;
- unsigned long tot_out = 0;
- unsigned long pg_bytes_left;
- unsigned long out_offset;
- unsigned long bytes;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
+ ASSERT(max_nr_page > 0);
*out_pages = 0;
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
-
/*
- * store the size of all chunks of compressed data in
- * the first 4 bytes
+ * Skip the header for now, we will later come back and write the total
+ * compressed size
*/
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- out_offset = LZO_LEN;
- tot_out = LZO_LEN;
- pages[0] = out_page;
- nr_pages = 1;
- pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
- /* compress at most one page of data each time */
- in_len = min(len, PAGE_SIZE);
- while (tot_in < len) {
- ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
- &out_len, workspace->mem);
- if (ret != LZO_E_OK) {
- pr_debug("BTRFS: lzo in loop returned %d\n",
- ret);
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap_local_page(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap_local(data_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
goto out;
}
- /* store the size of this chunk of compressed data */
- write_compress_length(cpage_out + out_offset, out_len);
- tot_out += LZO_LEN;
- out_offset += LZO_LEN;
- pg_bytes_left -= LZO_LEN;
-
- tot_in += in_len;
- tot_out += out_len;
-
- /* copy bytes from the working buffer into the pages */
- buf = workspace->cbuf;
- while (out_len) {
- bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
- memcpy(cpage_out + out_offset, buf, bytes);
-
- out_len -= bytes;
- pg_bytes_left -= bytes;
- buf += bytes;
- out_offset += bytes;
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, max_nr_page,
+ &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
- /*
- * we need another page for writing out.
- *
- * Note if there's less than 4 bytes left, we just
- * skip to a new page.
- */
- if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
- pg_bytes_left == 0) {
- if (pg_bytes_left) {
- memset(cpage_out + out_offset, 0,
- pg_bytes_left);
- tot_out += pg_bytes_left;
- }
-
- /* we're done, don't allocate new page */
- if (out_len == 0 && tot_in >= len)
- break;
-
- kunmap(out_page);
- if (nr_pages == nr_dest_pages) {
- out_page = NULL;
- ret = -E2BIG;
- goto out;
- }
-
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- pages[nr_pages++] = out_page;
-
- pg_bytes_left = PAGE_SIZE;
- out_offset = 0;
- }
- }
+ cur_in += in_len;
- /* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out) {
+ /*
+ * Check if we're making it bigger after two sectors. And if
+ * it is so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
ret = -E2BIG;
goto out;
}
- /* we're all done */
- if (tot_in >= len)
- break;
-
- if (tot_out > max_out)
- break;
+ /* Check if we have reached page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
+ }
- bytes_left = len - tot_in;
- kunmap(in_page);
- put_page(in_page);
+ /* Store the size of all chunks of compressed data */
+ sizes_ptr = kmap_local_page(pages[0]);
+ write_compress_length(sizes_ptr, cur_out);
+ kunmap_local(sizes_ptr);
- start += PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
- in_len = min(bytes_left, PAGE_SIZE);
- }
+ ret = 0;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
+out:
+ if (page_in)
+ put_page(page_in);
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
+ return ret;
+}
- if (tot_out >= tot_in) {
- ret = -E2BIG;
- goto out;
- }
+/*
+ * Copy the compressed segment payload into @dest.
+ *
+ * For the payload there will be no padding, just need to do page switching.
+ */
+static void copy_compressed_segment(struct compressed_bio *cb,
+ char *dest, u32 len, u32 *cur_in)
+{
+ u32 orig_in = *cur_in;
- /* store the size of all chunks of compressed data */
- cpage_out = kmap(pages[0]);
- write_compress_length(cpage_out, tot_out);
+ while (*cur_in < orig_in + len) {
+ struct page *cur_page;
+ u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
+ orig_in + len - *cur_in);
- kunmap(pages[0]);
+ ASSERT(copy_len);
+ cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
- ret = 0;
- *total_out = tot_out;
- *total_in = tot_in;
-out:
- *out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
+ memcpy_from_page(dest + *cur_in - orig_in, cur_page,
+ offset_in_page(*cur_in), copy_len);
- if (in_page) {
- kunmap(in_page);
- put_page(in_page);
+ *cur_in += copy_len;
}
-
- return ret;
}
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- int ret = 0, ret2;
- char *data_in;
- unsigned long page_in_index = 0;
- size_t srclen = cb->compressed_len;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
- unsigned long buf_start;
- unsigned long buf_offset = 0;
- unsigned long bytes;
- unsigned long working_bytes;
- size_t in_len;
- size_t out_len;
- const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
- unsigned long in_offset;
- unsigned long in_page_bytes_left;
- unsigned long tot_in;
- unsigned long tot_out;
- unsigned long tot_len;
- char *buf;
- bool may_late_unmap, need_unmap;
- struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
-
- data_in = kmap(pages_in[0]);
- tot_len = read_compress_length(data_in);
+ const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ char *kaddr;
+ int ret;
+ /* Compressed data length, can be unaligned */
+ u32 len_in;
+ /* Offset inside the compressed data */
+ u32 cur_in = 0;
+ /* Bytes decompressed so far */
+ u32 cur_out = 0;
+
+ kaddr = kmap_local_page(cb->compressed_pages[0]);
+ len_in = read_compress_length(kaddr);
+ kunmap_local(kaddr);
+ cur_in += LZO_LEN;
+
/*
- * Compressed data header check.
+ * LZO header length check
*
- * The real compressed size can't exceed the maximum extent length, and
- * all pages should be used (whole unused page with just the segment
- * header is not possible). If this happens it means the compressed
- * extent is corrupted.
+ * The total length should not exceed the maximum extent length,
+ * and all sectors should be used.
+ * If this happens, it means the compressed extent is corrupted.
*/
- if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
- tot_len < srclen - PAGE_SIZE) {
- ret = -EUCLEAN;
- goto done;
+ if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len) {
+ btrfs_err(fs_info,
+ "invalid lzo header, lzo len %u compressed len %u",
+ len_in, cb->compressed_len);
+ return -EUCLEAN;
}
- tot_in = LZO_LEN;
- in_offset = LZO_LEN;
- in_page_bytes_left = PAGE_SIZE - LZO_LEN;
-
- tot_out = 0;
-
- while (tot_in < tot_len) {
- in_len = read_compress_length(data_in + in_offset);
- in_page_bytes_left -= LZO_LEN;
- in_offset += LZO_LEN;
- tot_in += LZO_LEN;
+ /* Go through each lzo segment */
+ while (cur_in < len_in) {
+ struct page *cur_page;
+ /* Length of the compressed segment */
+ u32 seg_len;
+ u32 sector_bytes_left;
+ size_t out_len = lzo1x_worst_compress(sectorsize);
/*
- * Segment header check.
- *
- * The segment length must not exceed the maximum LZO
- * compression size, nor the total compressed size.
+ * We should always have enough space for one segment header
+ * inside current sector.
*/
- if (in_len > max_segment_len || tot_in + in_len > tot_len) {
- ret = -EUCLEAN;
- goto done;
- }
-
- tot_in += in_len;
- working_bytes = in_len;
- may_late_unmap = need_unmap = false;
-
- /* fast path: avoid using the working buffer */
- if (in_page_bytes_left >= in_len) {
- buf = data_in + in_offset;
- bytes = in_len;
- may_late_unmap = true;
- goto cont;
+ ASSERT(cur_in / sectorsize ==
+ (cur_in + LZO_LEN - 1) / sectorsize);
+ cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+ ASSERT(cur_page);
+ kaddr = kmap_local_page(cur_page);
+ seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
+ kunmap_local(kaddr);
+ cur_in += LZO_LEN;
+
+ if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ /*
+ * seg_len shouldn't be larger than we have allocated
+ * for workspace->cbuf
+ */
+ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
+ seg_len);
+ ret = -EIO;
+ goto out;
}
- /* copy bytes from the pages into the working buffer */
- buf = workspace->cbuf;
- buf_offset = 0;
- while (working_bytes) {
- bytes = min(working_bytes, in_page_bytes_left);
-
- memcpy(buf + buf_offset, data_in + in_offset, bytes);
- buf_offset += bytes;
-cont:
- working_bytes -= bytes;
- in_page_bytes_left -= bytes;
- in_offset += bytes;
-
- /* check if we need to pick another page */
- if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
- || in_page_bytes_left == 0) {
- tot_in += in_page_bytes_left;
-
- if (working_bytes == 0 && tot_in >= tot_len)
- break;
-
- if (page_in_index + 1 >= total_pages_in) {
- ret = -EIO;
- goto done;
- }
-
- if (may_late_unmap)
- need_unmap = true;
- else
- kunmap(pages_in[page_in_index]);
-
- data_in = kmap(pages_in[++page_in_index]);
-
- in_page_bytes_left = PAGE_SIZE;
- in_offset = 0;
- }
- }
+ /* Copy the compressed segment payload into workspace */
+ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
- out_len = max_segment_len;
- ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
- &out_len);
- if (need_unmap)
- kunmap(pages_in[page_in_index - 1]);
+ /* Decompress the data */
+ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
+ workspace->buf, &out_len);
if (ret != LZO_E_OK) {
- pr_warn("BTRFS: decompress failed\n");
+ btrfs_err(fs_info, "failed to decompress");
ret = -EIO;
- break;
+ goto out;
}
- buf_start = tot_out;
- tot_out += out_len;
+ /* Copy the data into inode pages */
+ ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
+ cur_out += out_len;
+
+ /* All data read, exit */
+ if (ret == 0)
+ goto out;
+ ret = 0;
- ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- tot_out, disk_start, orig_bio);
- if (ret2 == 0)
- break;
+ /* Check if the sector has enough space for a segment header */
+ sector_bytes_left = sectorsize - (cur_in % sectorsize);
+ if (sector_bytes_left >= LZO_LEN)
+ continue;
+
+ /* Skip the padding zeros */
+ cur_in += sector_bytes_left;
}
-done:
- kunmap(pages_in[page_in_index]);
+out:
if (!ret)
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
return ret;
}
@@ -427,7 +432,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
+ size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
char *kaddr;
unsigned long bytes;
@@ -467,7 +472,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = kmap_atomic(dest_page);
+ kaddr = kmap_local_page(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -477,7 +482,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 72bab64ecf60..f9850edfd726 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -5,7 +5,8 @@
#include <linux/sched.h>
#include <linux/wait.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
+#include <linux/rbtree.h>
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
@@ -58,4 +59,92 @@ static inline bool has_single_bit_set(u64 n)
return is_power_of_two_u64(n);
}
+/*
+ * Simple bytenr based rb_tree relate structures
+ *
+ * Any structure wants to use bytenr as single search index should have their
+ * structure start with these members.
+ */
+struct rb_simple_node {
+ struct rb_node rb_node;
+ u64 bytenr;
+};
+
+static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_simple_node *entry;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr)
+ node = node->rb_left;
+ else if (bytenr > entry->bytenr)
+ node = node->rb_right;
+ else
+ return node;
+ }
+ return NULL;
+}
+
+/*
+ * Search @root from an entry that starts or comes after @bytenr.
+ *
+ * @root: the root to search.
+ * @bytenr: bytenr to search from.
+ *
+ * Return the rb_node that start at or after @bytenr. If there is no entry at
+ * or after @bytner return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+ u64 bytenr)
+{
+ struct rb_node *node = root->rb_node, *ret = NULL;
+ struct rb_simple_node *entry, *ret_entry = NULL;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr) {
+ if (!ret || entry->bytenr < ret_entry->bytenr) {
+ ret = node;
+ ret_entry = entry;
+ }
+
+ node = node->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ node = node->rb_right;
+ } else {
+ return node;
+ }
+ }
+
+ return ret;
+}
+
+static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rb_simple_node *entry;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return parent;
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a65f189a5b94..e54f8280031f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -15,6 +15,8 @@
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "qgroup.h"
+#include "subpage.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -106,17 +108,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
return NULL;
}
-/*
- * helper to check if a given offset is inside a given entry
- */
-static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
-{
- if (file_offset < entry->file_offset ||
- entry->file_offset + entry->num_bytes <= file_offset)
- return 0;
- return 1;
-}
-
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
@@ -141,7 +132,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
if (tree->last) {
entry = rb_entry(tree->last, struct btrfs_ordered_extent,
rb_node);
- if (offset_in_entry(entry, file_offset))
+ if (in_range(file_offset, entry->file_offset, entry->num_bytes))
return tree->last;
}
ret = __tree_search(root, file_offset, &prev);
@@ -152,53 +143,83 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/* allocate and add a new ordered_extent into the per-inode tree.
+/**
+ * Add an ordered extent to the per-inode tree.
*
- * The tree is given a single reference on the ordered extent that was
- * inserted.
+ * @inode: Inode that this extent is for.
+ * @file_offset: Logical offset in file where the extent starts.
+ * @num_bytes: Logical length of extent in file.
+ * @ram_bytes: Full length of unencoded data.
+ * @disk_bytenr: Offset of extent on disk.
+ * @disk_num_bytes: Size of extent on disk.
+ * @offset: Offset into unencoded data where file data starts.
+ * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type: Compression algorithm used for data.
+ *
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted.
+ *
+ * Return: 0 or -ENOMEM.
*/
-static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type, int dio,
- int compress_type)
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
+ int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
+ if (flags &
+ ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
+ /* For nocow write, we can release the qgroup rsv right now */
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+ } else {
+ /*
+ * The ordered extent has reserved qgroup space, release now
+ * and pass the reserved number for qgroup_record to free.
+ */
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ if (ret < 0)
+ return ret;
+ }
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
entry->file_offset = file_offset;
- entry->disk_bytenr = disk_bytenr;
entry->num_bytes = num_bytes;
+ entry->ram_bytes = ram_bytes;
+ entry->disk_bytenr = disk_bytenr;
entry->disk_num_bytes = disk_num_bytes;
+ entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = igrab(inode);
+ entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
- if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
- set_bit(type, &entry->flags);
+ entry->qgroup_rsv = ret;
+ entry->physical = (u64)-1;
- if (dio) {
- percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
- fs_info->delalloc_batch);
- set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
- }
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+ entry->flags = flags;
+
+ percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
+ fs_info->delalloc_batch);
/* one ref for the tree */
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- INIT_LIST_HEAD(&entry->log_list);
- INIT_LIST_HEAD(&entry->trans_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -228,41 +249,13 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
* that work has been done at higher layers, so this is truly the
* smallest the extent is going to get.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, 1);
+ spin_unlock(&inode->lock);
return 0;
}
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type)
-{
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type)
-{
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 1,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type,
- int compress_type)
-{
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
- compress_type);
-}
-
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -279,100 +272,178 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
spin_unlock_irq(&tree->lock);
}
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
+}
+
/*
- * this is used to account for finished IO across a given range
- * of the file. The IO may span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
+ * Mark all ordered extents io inside the specified range finished.
*
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
+ * @page: The involved page for the operation.
+ * For uncompressed buffered IO, the page status also needs to be
+ * updated to indicate whether the pending ordered io is finished.
+ * Can be NULL for direct IO and compressed write.
+ * For these cases, callers are ensured they won't execute the
+ * endio function twice.
*
- * file_offset is updated to one byte past the range that is recorded as
- * complete. This allows you to walk forward in the file.
+ * This function is called for endio, thus the range must have ordered
+ * extent(s) covering it.
*/
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size, int uptodate)
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_workqueue *wq;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- int ret;
unsigned long flags;
- u64 dec_end;
- u64 dec_start;
- u64 to_dec;
+ u64 cur = file_offset;
+
+ if (btrfs_is_free_space_inode(inode))
+ wq = fs_info->endio_freespace_worker;
+ else
+ wq = fs_info->endio_write_workers;
+
+ if (page)
+ ASSERT(page->mapping && page_offset(page) <= file_offset &&
+ file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, *file_offset);
- if (!node) {
- ret = 1;
- goto out;
- }
+ while (cur < file_offset + num_bytes) {
+ u64 entry_end;
+ u64 end;
+ u32 len;
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, *file_offset)) {
- ret = 1;
- goto out;
- }
+ node = tree_search(tree, cur);
+ /* No ordered extents at all */
+ if (!node)
+ break;
- dec_start = max(*file_offset, entry->file_offset);
- dec_end = min(*file_offset + io_size,
- entry->file_offset + entry->num_bytes);
- *file_offset = dec_end;
- if (dec_start > dec_end) {
- btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
- dec_start, dec_end);
- }
- to_dec = dec_end - dec_start;
- if (to_dec > entry->bytes_left) {
- btrfs_crit(fs_info,
- "bad ordered accounting left %llu size %llu",
- entry->bytes_left, to_dec);
- }
- entry->bytes_left -= to_dec;
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ entry_end = entry->file_offset + entry->num_bytes;
+ /*
+ * |<-- OE --->| |
+ * cur
+ * Go to next OE.
+ */
+ if (cur >= entry_end) {
+ node = rb_next(node);
+ /* No more ordered extents, exit */
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_ordered_extent,
+ rb_node);
+
+ /* Go to next ordered extent and continue */
+ cur = entry->file_offset;
+ continue;
+ }
+ /*
+ * | |<--- OE --->|
+ * cur
+ * Go to the start of OE.
+ */
+ if (cur < entry->file_offset) {
+ cur = entry->file_offset;
+ continue;
+ }
- if (entry->bytes_left == 0) {
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /* test_and_set_bit implies a barrier */
- cond_wake_up_nomb(&entry->wait);
- } else {
- ret = 1;
- }
-out:
- if (!ret && cached && entry) {
- *cached = entry;
- refcount_inc(&entry->refs);
+ /*
+ * Now we are definitely inside one ordered extent.
+ *
+ * |<--- OE --->|
+ * |
+ * cur
+ */
+ end = min(entry->file_offset + entry->num_bytes,
+ file_offset + num_bytes) - 1;
+ ASSERT(end + 1 - cur < U32_MAX);
+ len = end + 1 - cur;
+
+ if (page) {
+ /*
+ * Ordered (Private2) bit indicates whether we still
+ * have pending io unfinished for the ordered extent.
+ *
+ * If there's no such bit, we need to skip to next range.
+ */
+ if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
+ cur += len;
+ continue;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, len);
+ }
+
+ /* Now we're fine to update the accounting */
+ if (unlikely(len > entry->bytes_left)) {
+ WARN_ON(1);
+ btrfs_crit(fs_info,
+"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode),
+ entry->file_offset,
+ entry->num_bytes,
+ len, entry->bytes_left);
+ entry->bytes_left = 0;
+ } else {
+ entry->bytes_left -= len;
+ }
+
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ /*
+ * All the IO of the ordered extent is finished, we need to queue
+ * the finish_func to be executed.
+ */
+ if (entry->bytes_left == 0) {
+ set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ cond_wake_up(&entry->wait);
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_mark_finished(inode, entry);
+ spin_unlock_irqrestore(&tree->lock, flags);
+ btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(wq, &entry->work);
+ spin_lock_irqsave(&tree->lock, flags);
+ }
+ cur += len;
}
spin_unlock_irqrestore(&tree->lock, flags);
- return ret == 0;
}
/*
- * this is used to account for finished IO across a given range
- * of the file. The IO should not span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
+ * Finish IO for one ordered extent across a given range. The range can only
+ * contain one ordered extent.
*
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
+ * @cached: The cached ordered extent. If not NULL, we can skip the tree
+ * search and use the ordered extent directly.
+ * Will be also used to store the finished ordered extent.
+ * @file_offset: File offset for the finished IO
+ * @io_size: Length of the finish IO range
+ *
+ * Return true if the ordered extent is finished in the range, and update
+ * @cached.
+ * Return false otherwise.
+ *
+ * NOTE: The range can NOT cross multiple ordered extents.
+ * Thus caller should ensure the range doesn't cross ordered extents.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate)
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- int ret;
+ bool finished = false;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -380,41 +451,38 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
}
node = tree_search(tree, file_offset);
- if (!node) {
- ret = 1;
+ if (!node)
goto out;
- }
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
- if (!offset_in_entry(entry, file_offset)) {
- ret = 1;
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
goto out;
- }
- if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ if (io_size > entry->bytes_left)
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
- }
+
entry->bytes_left -= io_size;
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Ensure only one caller can set the flag and finished_ret
+ * accordingly
+ */
+ finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
/* test_and_set_bit implies a barrier */
cond_wake_up_nomb(&entry->wait);
- } else {
- ret = 1;
}
out:
- if (!ret && cached && entry) {
+ if (finished && cached && entry) {
*cached = entry;
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
spin_unlock_irqrestore(&tree->lock, flags);
- return ret == 0;
+ return finished;
}
/*
@@ -426,12 +494,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
- ASSERT(list_empty(&entry->log_list));
- ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -449,26 +516,39 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
+ bool freespace_inode;
+
+ /*
+ * If this is a free space inode the thread has not acquired the ordered
+ * extents lockdep map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
- if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
- false);
+ if (root != fs_info->tree_root) {
+ u64 release;
- if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
- fs_info->delalloc_batch);
+ if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
+ release = entry->disk_num_bytes;
+ else
+ release = entry->num_bytes;
+ btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ }
+
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
+ fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -478,13 +558,43 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
+ btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -494,6 +604,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
}
spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
+ if (!freespace_inode)
+ btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
@@ -501,7 +613,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -580,7 +692,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
@@ -588,7 +700,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
- btrfs_put_fs_root(root);
+ btrfs_put_root(root);
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
@@ -607,23 +719,31 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
/*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
+ /*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
}
@@ -666,7 +786,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -677,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -698,26 +818,29 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* find an ordered extent corresponding to file_offset. return NULL if
* nothing is found, otherwise take a reference on the extent and return it
*/
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
- tree = &BTRFS_I(inode)->ordered_tree;
- spin_lock_irq(&tree->lock);
+ tree = &inode->ordered_tree;
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, file_offset);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, file_offset))
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup(inode, entry);
+ }
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return entry;
}
@@ -755,24 +878,55 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
break;
}
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_range(inode, entry);
+ }
spin_unlock_irq(&tree->lock);
return entry;
}
/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -780,190 +934,94 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
spin_unlock_irq(&tree->lock);
return entry;
}
/*
- * After an extent is done, call this to conditionally update the on disk
- * i_size. i_size is updated to cover any fully written part of the file.
+ * Lookup the first ordered extent that overlaps the range
+ * [@file_offset, @file_offset + @len).
+ *
+ * The difference between this and btrfs_lookup_first_ordered_extent() is
+ * that this one won't return any ordered extent that does not overlap the range.
+ * And the difference against btrfs_lookup_ordered_extent() is, this function
+ * ensures the first ordered extent gets returned.
*/
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
- struct btrfs_ordered_extent *ordered)
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- u64 disk_i_size;
- u64 new_i_size;
- u64 i_size = i_size_read(inode);
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
- struct rb_node *prev = NULL;
- struct btrfs_ordered_extent *test;
- int ret = 1;
- u64 orig_offset = offset;
+ struct rb_node *cur;
+ struct rb_node *prev;
+ struct rb_node *next;
+ struct btrfs_ordered_extent *entry = NULL;
spin_lock_irq(&tree->lock);
- if (ordered) {
- offset = entry_end(ordered);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
- offset = min(offset,
- ordered->file_offset +
- ordered->truncated_len);
- } else {
- offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
- }
- disk_i_size = BTRFS_I(inode)->disk_i_size;
-
- /*
- * truncate file.
- * If ordered is not NULL, then this is called from endio and
- * disk_i_size will be updated by either truncate itself or any
- * in-flight IOs which are inside the disk_i_size.
- *
- * Because btrfs_setsize() may set i_size with disk_i_size if truncate
- * fails somehow, we need to make sure we have a precise disk_i_size by
- * updating it as usual.
- *
- */
- if (!ordered && disk_i_size > i_size) {
- BTRFS_I(inode)->disk_i_size = orig_offset;
- ret = 0;
- goto out;
- }
-
- /*
- * if the disk i_size is already at the inode->i_size, or
- * this ordered extent is inside the disk i_size, we're done
- */
- if (disk_i_size == i_size)
- goto out;
-
+ node = tree->tree.rb_node;
/*
- * We still need to update disk_i_size if outstanding_isize is greater
- * than disk_i_size.
+ * Here we don't want to use tree_search() which will use tree->last
+ * and screw up the search order.
+ * And __tree_search() can't return the adjacent ordered extents
+ * either, thus here we do our own search.
*/
- if (offset <= disk_i_size &&
- (!ordered || ordered->outstanding_isize <= disk_i_size))
- goto out;
+ while (node) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- /*
- * walk backward from this ordered extent to disk_i_size.
- * if we find an ordered extent then we can't update disk i_size
- * yet
- */
- if (ordered) {
- node = rb_prev(&ordered->rb_node);
- } else {
- prev = tree_search(tree, offset);
- /*
- * we insert file extents without involving ordered struct,
- * so there should be no ordered struct cover this offset
- */
- if (prev) {
- test = rb_entry(prev, struct btrfs_ordered_extent,
- rb_node);
- BUG_ON(offset_in_entry(test, offset));
+ if (file_offset < entry->file_offset) {
+ node = node->rb_left;
+ } else if (file_offset >= entry_end(entry)) {
+ node = node->rb_right;
+ } else {
+ /*
+ * Direct hit, got an ordered extent that starts at
+ * @file_offset
+ */
+ goto out;
}
- node = prev;
}
- for (; node; node = rb_prev(node)) {
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
- /* We treat this entry as if it doesn't exist */
- if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
- continue;
-
- if (entry_end(test) <= disk_i_size)
- break;
- if (test->file_offset >= i_size)
- break;
-
- /*
- * We don't update disk_i_size now, so record this undealt
- * i_size. Or we will not know the real i_size.
- */
- if (test->outstanding_isize < offset)
- test->outstanding_isize = offset;
- if (ordered &&
- ordered->outstanding_isize > test->outstanding_isize)
- test->outstanding_isize = ordered->outstanding_isize;
+ if (!entry) {
+ /* Empty tree */
goto out;
}
- new_i_size = min_t(u64, offset, i_size);
-
- /*
- * Some ordered extents may completed before the current one, and
- * we hold the real i_size in ->outstanding_isize.
- */
- if (ordered && ordered->outstanding_isize > new_i_size)
- new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
- BTRFS_I(inode)->disk_i_size = new_i_size;
- ret = 0;
-out:
- /*
- * We need to do this because we can't remove ordered extents until
- * after the i_disk_size has been updated and then the inode has been
- * updated to reflect the change, so we need to tell anybody who finds
- * this ordered extent that we've already done all the real work, we
- * just haven't completed all the other work.
- */
- if (ordered)
- set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
- spin_unlock_irq(&tree->lock);
- return ret;
-}
-/*
- * search the ordered extents for one corresponding to 'offset' and
- * try to find a checksum. This is used because we allow pages to
- * be reclaimed before their checksum is actually put into the btree
- */
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- unsigned long num_sectors;
- unsigned long i;
- u32 sectorsize = btrfs_inode_sectorsize(inode);
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int index = 0;
-
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- if (!ordered)
- return 0;
-
- spin_lock_irq(&tree->lock);
- list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
- if (disk_bytenr >= ordered_sum->bytenr &&
- disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = min_t(int, len - index, num_sectors - i);
- memcpy(sum + index, ordered_sum->sums + i * csum_size,
- num_sectors * csum_size);
-
- index += (int)num_sectors * csum_size;
- if (index == len)
- goto out;
- disk_bytenr += num_sectors * sectorsize;
- }
+ cur = &entry->rb_node;
+ /* We got an entry around @file_offset, check adjacent entries */
+ if (entry->file_offset < file_offset) {
+ prev = cur;
+ next = rb_next(cur);
+ } else {
+ prev = rb_prev(cur);
+ next = cur;
+ }
+ if (prev) {
+ entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
}
+ if (next) {
+ entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ /* No ordered extent in the range */
+ entry = NULL;
out:
+ if (entry) {
+ refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
+ }
+
spin_unlock_irq(&tree->lock);
- btrfs_put_ordered_extent(ordered);
- return index;
+ return entry;
}
/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
- * @tree: IO tree used for locking out other users of the range
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
@@ -973,8 +1031,7 @@ out:
* This function always returns with the given range locked, ensuring after it's
* called no order extent can be pending.
*/
-void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
- struct btrfs_inode *inode, u64 start,
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state)
{
@@ -986,7 +1043,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
cachedp = cached_state;
while (1) {
- lock_extent_bits(tree, start, end, cachedp);
+ lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -999,12 +1056,101 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ unlock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
+/*
+ * Lock the passed range and ensure all pending ordered extents in it are run
+ * to completion in nowait mode.
+ *
+ * Return true if btrfs_lock_ordered_range does not return any extents,
+ * otherwise false.
+ */
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, start, end))
+ return false;
+
+ ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
+ if (!ordered)
+ return true;
+
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&inode->io_tree, start, end, NULL);
+
+ return false;
+}
+
+
+static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
+ u64 len)
+{
+ struct inode *inode = ordered->inode;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ u64 file_offset = ordered->file_offset + pos;
+ u64 disk_bytenr = ordered->disk_bytenr + pos;
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+
+ /*
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
+ */
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
+ fs_info->delalloc_batch);
+ WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
+}
+
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post)
+{
+ struct inode *inode = ordered->inode;
+ struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct rb_node *node;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret = 0;
+
+ trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+
+ spin_lock_irq(&tree->lock);
+ /* Remove from tree once */
+ node = &ordered->rb_node;
+ rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
+ if (tree->last == node)
+ tree->last = NULL;
+
+ ordered->file_offset += pre;
+ ordered->disk_bytenr += pre;
+ ordered->num_bytes -= (pre + post);
+ ordered->disk_num_bytes -= (pre + post);
+ ordered->bytes_left -= (pre + post);
+
+ /* Re-insert the node */
+ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+ if (node)
+ btrfs_panic(fs_info, -EEXIST,
+ "zoned: inconsistency in ordered tree at offset %llu",
+ ordered->file_offset);
+
+ spin_unlock_irq(&tree->lock);
+
+ if (pre)
+ ret = clone_ordered_extent(ordered, 0, pre);
+ if (ret == 0 && post)
+ ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
+ post);
+
+ return ret;
+}
+
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3beb4da4ab41..f59f2dbdb25e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -27,7 +27,7 @@ struct btrfs_ordered_sum {
};
/*
- * bits for the flags field:
+ * Bits for btrfs_ordered_extent::flags.
*
* BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
* It is used to make sure metadata is inserted into the tree only once
@@ -38,31 +38,54 @@ struct btrfs_ordered_sum {
* IO is done and any metadata is inserted into the tree.
*/
enum {
+ /*
+ * Different types for ordered extents, one and only one of the 4 types
+ * need to be set when creating ordered extent.
+ *
+ * REGULAR: For regular non-compressed COW write
+ * NOCOW: For NOCOW write into existing non-hole extent
+ * PREALLOC: For NOCOW write into preallocated extent
+ * COMPRESSED: For compressed COW write
+ */
+ BTRFS_ORDERED_REGULAR,
+ BTRFS_ORDERED_NOCOW,
+ BTRFS_ORDERED_PREALLOC,
+ BTRFS_ORDERED_COMPRESSED,
+
+ /*
+ * Extra bit for direct io, can only be set for
+ * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent.
+ */
+ BTRFS_ORDERED_DIRECT,
+
+ /* Extra status bits for ordered extents */
+
/* set when all the pages are written */
BTRFS_ORDERED_IO_DONE,
/* set when removed from the tree */
BTRFS_ORDERED_COMPLETE,
- /* set when we want to write in place */
- BTRFS_ORDERED_NOCOW,
- /* writing a zlib compressed extent */
- BTRFS_ORDERED_COMPRESSED,
- /* set when writing to preallocated extent */
- BTRFS_ORDERED_PREALLOC,
- /* set when we're doing DIO with this extent */
- BTRFS_ORDERED_DIRECT,
/* We had an io error when writing this out */
BTRFS_ORDERED_IOERR,
- /*
- * indicates whether this ordered extent has done its due diligence in
- * updating the isize
- */
- BTRFS_ORDERED_UPDATED_ISIZE,
/* Set when we have to truncate an extent */
BTRFS_ORDERED_TRUNCATED,
- /* Regular IO for COW */
- BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
+ /* BTRFS_IOC_ENCODED_WRITE */
+ BTRFS_ORDERED_ENCODED,
};
+/* BTRFS_ORDERED_* flags that specify the type of the extent. */
+#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \
+ (1UL << BTRFS_ORDERED_NOCOW) | \
+ (1UL << BTRFS_ORDERED_PREALLOC) | \
+ (1UL << BTRFS_ORDERED_COMPRESSED) | \
+ (1UL << BTRFS_ORDERED_DIRECT) | \
+ (1UL << BTRFS_ORDERED_ENCODED))
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -71,9 +94,11 @@ struct btrfs_ordered_extent {
* These fields directly correspond to the same fields in
* btrfs_file_extent_item.
*/
- u64 disk_bytenr;
u64 num_bytes;
+ u64 ram_bytes;
+ u64 disk_bytenr;
u64 disk_num_bytes;
+ u64 offset;
/* number of bytes that still need writing */
u64 bytes_left;
@@ -97,6 +122,9 @@ struct btrfs_ordered_extent {
/* compression algorithm */
int compress_type;
+ /* Qgroup reserved space */
+ int qgroup_rsv;
+
/* reference count */
refcount_t refs;
@@ -106,12 +134,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
- /* If we need to wait on this to be done */
+ /* used for fast fsyncs */
struct list_head log_list;
- /* If the transaction needs to wait on this ordered extent */
- struct list_head trans_list;
-
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -126,20 +151,14 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
-};
-/*
- * calculates the total size you need to allocate for an ordered sum
- * structure spanning 'bytes' in the file
- */
-static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
- unsigned long bytes)
-{
- int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
- int csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size;
-}
+ /*
+ * Used to reverse-map physical address returned from ZONE_APPEND write
+ * command in a workqueue context
+ */
+ u64 physical;
+ struct block_device *bdev;
+};
static inline void
btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
@@ -149,51 +168,47 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate);
-int btrfs_dec_test_first_ordered_pending(struct inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 *file_offset, u64 io_size,
- int uptodate);
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type);
-int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type);
-int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type,
- int compress_type);
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate);
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
-struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
- struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
-void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
- struct btrfs_inode *inode, u64 start,
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 61f44e78e3c9..dd8777872143 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,45 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
+{
+ int i;
+
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", key->offset);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == key->objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", key->objectid);
+ return buf;
+}
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
@@ -47,7 +86,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u64 flags;
u64 offset;
int ref_index = 0;
@@ -95,9 +134,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -112,8 +152,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
@@ -137,8 +178,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
__le64 subvol_id;
read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
- pr_info("\t\tsubvol_id %llu\n",
- (unsigned long long)le64_to_cpu(subvol_id));
+ pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id));
item_size -= sizeof(u64);
offset += sizeof(u64);
}
@@ -151,15 +191,8 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
static void print_eb_refs_lock(struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
- btrfs_info(eb->fs_info,
-"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
- atomic_read(&eb->refs), eb->write_locks,
- atomic_read(&eb->read_locks),
- eb->blocking_writers,
- atomic_read(&eb->blocking_readers),
- eb->spinning_writers,
- atomic_read(&eb->spinning_readers),
- eb->lock_owner, current->pid);
+ btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
+ atomic_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
@@ -168,7 +201,6 @@ void btrfs_print_leaf(struct extent_buffer *l)
struct btrfs_fs_info *fs_info;
int i;
u32 type, nr;
- struct btrfs_item *item;
struct btrfs_root_item *ri;
struct btrfs_dir_item *di;
struct btrfs_inode_item *ii;
@@ -192,12 +224,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
btrfs_leaf_free_space(l), btrfs_header_owner(l));
print_eb_refs_lock(l);
for (i = 0 ; i < nr ; i++) {
- item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
type = key.type;
pr_info("\titem %d key (%llu %u %llu) itemoff %d itemsize %d\n",
i, key.objectid, type, key.offset,
- btrfs_item_offset(l, item), btrfs_item_size(l, item));
+ btrfs_item_offset(l, i), btrfs_item_size(l, i));
switch (type) {
case BTRFS_INODE_ITEM_KEY:
ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
@@ -315,7 +346,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
case BTRFS_UUID_KEY_SUBVOL:
case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
- btrfs_item_size_nr(l, i));
+ btrfs_item_size(l, i));
break;
}
}
@@ -358,11 +389,12 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_node_key_to_cpu(c, &first_key, i);
next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+ btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
- if (IS_ERR(next)) {
+ if (IS_ERR(next))
continue;
- } else if (!extent_buffer_uptodate(next)) {
+ if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
continue;
}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38fd75ad..8c3e9319ec4e 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(const struct btrfs_key *key, char *buf);
#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index deb59e7cfcac..055a631276ce 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
struct prop_handler {
struct hlist_node node;
const char *xattr_name;
- int (*validate)(const char *value, size_t len);
+ int (*validate)(const struct btrfs_inode *inode, const char *value,
+ size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
const char *(*extract)(struct inode *inode);
+ bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -55,7 +57,8 @@ find_prop_handler(const char *name,
return NULL;
}
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len)
{
const struct prop_handler *handler;
@@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
if (value_len == 0)
return 0;
- return handler->validate(value, value_len);
+ return handler->validate(inode, value, value_len);
+}
+
+/*
+ * Check if a property should be ignored (not set) for an inode.
+ *
+ * @inode: The target inode.
+ * @name: The property's name.
+ *
+ * The caller must be sure the given property name is valid, for example by
+ * having previously called btrfs_validate_prop().
+ *
+ * Returns: true if the property should be ignored for the given inode
+ * false if the property must not be ignored for the given inode
+ */
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
+{
+ const struct prop_handler *handler;
+
+ handler = find_prop_handler(name, NULL);
+ ASSERT(handler != NULL);
+
+ return handler->ignore(inode);
}
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -158,7 +183,7 @@ static int iterate_object_props(struct btrfs_root *root,
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
cur = 0;
- total_len = btrfs_item_size_nr(leaf, slot);
+ total_len = btrfs_item_size(leaf, slot);
while (cur < total_len) {
u32 name_len = btrfs_dir_name_len(leaf, di);
@@ -245,21 +270,26 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 ino = btrfs_ino(BTRFS_I(inode));
- int ret;
-
- ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
- return ret;
+ return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
}
-static int prop_compression_validate(const char *value, size_t len)
+static int prop_compression_validate(const struct btrfs_inode *inode,
+ const char *value, size_t len)
{
+ if (!btrfs_inode_can_compress(inode))
+ return -EINVAL;
+
if (!value)
return 0;
if (btrfs_compress_is_valid_type(value, len))
return 0;
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0))
+ return 0;
+
return -EINVAL;
}
@@ -269,7 +299,17 @@ static int prop_compression_apply(struct inode *inode, const char *value,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int type;
+ /* Reset to defaults */
if (len == 0) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ return 0;
+ }
+
+ /* Set NOCOMPRESS flag */
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
@@ -296,6 +336,22 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return 0;
}
+static bool prop_compression_ignore(const struct btrfs_inode *inode)
+{
+ /*
+ * Compression only has effect for regular files, and for directories
+ * we set it just to propagate it to new files created inside them.
+ * Everything else (symlinks, devices, sockets, fifos) is pointless as
+ * it will do nothing, so don't waste metadata space on a compression
+ * xattr for anything that is neither a file nor a directory.
+ */
+ if (!S_ISREG(inode->vfs_inode.i_mode) &&
+ !S_ISDIR(inode->vfs_inode.i_mode))
+ return true;
+
+ return false;
+}
+
static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
@@ -316,13 +372,13 @@ static struct prop_handler prop_handlers[] = {
.validate = prop_compression_validate,
.apply = prop_compression_apply,
.extract = prop_compression_extract,
+ .ignore = prop_compression_ignore,
.inheritable = 1
},
};
-static int inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *parent)
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -342,15 +398,18 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
+ if (h->ignore(BTRFS_I(inode)))
+ continue;
+
value = h->extract(parent);
if (!value)
continue;
/*
* This is not strictly necessary as the property should be
- * valid, but in case it isn't, don't propagate it futher.
+ * valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(value, strlen(value));
+ ret = h->validate(BTRFS_I(inode), value, strlen(value));
if (ret)
continue;
@@ -363,8 +422,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
*/
if (need_reserve) {
num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_block_rsv_add(fs_info, trans->block_rsv,
+ num_bytes,
+ BTRFS_RESERVE_NO_FLUSH);
if (ret)
return ret;
}
@@ -383,7 +443,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (need_reserve) {
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- num_bytes);
+ num_bytes, NULL);
if (ret)
return ret;
}
@@ -393,46 +453,6 @@ static int inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *dir)
-{
- if (!dir)
- return 0;
-
- return inherit_props(trans, inode, dir);
-}
-
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root)
-{
- struct super_block *sb = root->fs_info->sb;
- struct btrfs_key key;
- struct inode *parent_inode, *child_inode;
- int ret;
-
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- parent_inode = btrfs_iget(sb, &key, parent_root);
- if (IS_ERR(parent_inode))
- return PTR_ERR(parent_inode);
-
- child_inode = btrfs_iget(sb, &key, root);
- if (IS_ERR(child_inode)) {
- iput(parent_inode);
- return PTR_ERR(child_inode);
- }
-
- ret = inherit_props(trans, child_inode, parent_inode);
- iput(child_inode);
- iput(parent_inode);
-
- return ret;
-}
-
void __init btrfs_props_init(void)
{
int i;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 40b2c65b518c..ca9dd3df129b 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -13,7 +13,9 @@ void __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len);
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
@@ -21,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *dir);
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root);
-
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ff1870ff3474..9334c3157c22 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,7 +11,7 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
-#include <linux/sizes.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
@@ -22,18 +22,8 @@
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
-
-/* TODO XXX FIXME
- * - subvol delete -> delete when ref goes to 0? delete limits also?
- * - reorganize keys
- * - compressed
- * - sync
- * - copy also limits on subvol creation
- * - limit
- * - caches for ulists
- * - performance benchmarks
- * - check all ioctl parameters
- */
+#include "sysfs.h"
+#include "tree-mod-log.h"
/*
* Helpers to access qgroup reservation
@@ -220,7 +210,8 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
@@ -240,7 +231,6 @@ static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
list_del(&list->next_member);
kfree(list);
}
- kfree(qgroup);
}
/* must be called with qgroup_lock held */
@@ -252,20 +242,23 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
return 0;
}
-/* must be called with qgroup_lock held */
-static int add_relation_rb(struct btrfs_fs_info *fs_info,
- u64 memberid, u64 parentid)
+/*
+ * Add relation specified by two qgroups.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the qgroups is NULL
+ * <0 other errors
+ */
+static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup *member;
- struct btrfs_qgroup *parent;
struct btrfs_qgroup_list *list;
- member = find_qgroup_rb(fs_info, memberid);
- parent = find_qgroup_rb(fs_info, parentid);
if (!member || !parent)
return -ENOENT;
@@ -281,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info,
return 0;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add relation specified by two qgroup ids.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the ids does not exist
+ * <0 other errors
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+
+ return __add_relation_rb(member, parent);
+}
+
+/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
u64 memberid, u64 parentid)
{
@@ -320,6 +333,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
}
#endif
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+{
+ fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
+ BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -351,6 +371,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
/* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
@@ -385,7 +408,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
if (btrfs_qgroup_status_generation(l, ptr) !=
fs_info->generation) {
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
@@ -403,7 +426,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
btrfs_err(fs_info, "inconsistent qgroup config");
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
qgroup = add_qgroup_rb(fs_info, found_key.offset);
@@ -412,6 +435,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+
switch (found_key.type) {
case BTRFS_QGROUP_INFO_KEY: {
struct btrfs_qgroup_info_item *ptr;
@@ -488,24 +515,63 @@ next2:
break;
}
out:
+ btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
ret >= 0)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
- btrfs_free_path(path);
if (ret < 0) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
+/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
* quota_root to be null with qgroup_lock held before, so it is safe to clean
@@ -519,7 +585,9 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(qgroup);
+ __del_qgroup_rb(fs_info, qgroup);
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
@@ -528,6 +596,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
*/
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
+ btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -816,7 +885,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
- btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
@@ -844,8 +914,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
key.objectid = 0;
key.offset = 0;
key.type = 0;
@@ -887,19 +955,53 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_trans_handle *trans = NULL;
+ struct ulist *ulist = NULL;
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "qgroups are currently unsupported in extent tree v2");
+ return -EINVAL;
+ }
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
+ ulist = ulist_alloc(GFP_KERNEL);
+ if (!ulist) {
ret = -ENOMEM;
goto out;
}
+ ret = btrfs_sysfs_add_qgroups(fs_info);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+ * avoid lock acquisition inversion problems (reported by lockdep) between
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+ * start a transaction.
+ * After we started the transaction lock qgroup_ioctl_lock again and
+ * check if someone else created the quota root in the meanwhile. If so,
+ * just return success and release the transaction handle.
+ *
+ * Also we don't need to worry about someone else calling
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+ * that function returns 0 (success) when the sysfs entries already exist.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
@@ -909,12 +1011,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
* would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
+ if (fs_info->quota_root)
+ goto out;
+
+ fs_info->qgroup_ulist = ulist;
+ ulist = NULL;
+
/*
* initially create the quota tree
*/
@@ -950,7 +1060,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
btrfs_mark_buffer_dirty(leaf);
@@ -974,6 +1085,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+ /* Release locks on tree_root before we access quota_root */
+ btrfs_release_path(path);
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -987,6 +1102,25 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ ret = btrfs_search_slot_for_read(tree_root, &found_key,
+ path, 1, 0);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ if (ret > 0) {
+ /*
+ * Shouldn't happen, but in case it does we
+ * don't need to do the btrfs_next_item, just
+ * continue.
+ */
+ continue;
+ }
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
@@ -1011,9 +1145,25 @@ out_add_root:
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
@@ -1030,26 +1180,43 @@ out_add_root:
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
btrfs_free_path(path);
out_free_root:
- if (ret) {
- free_extent_buffer(quota_root->node);
- free_extent_buffer(quota_root->commit_root);
- kfree(quota_root);
- }
+ if (ret)
+ btrfs_put_root(quota_root);
out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
- if (trans)
- btrfs_end_transaction(trans);
+ btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ ulist_free(ulist);
return ret;
}
@@ -1059,28 +1226,60 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
/*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done.
+ *
+ * Also, we must always start a transaction without holding the mutex
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ if (!fs_info->quota_root)
+ goto out;
+
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -1088,13 +1287,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
list_del(&quota_root->dirty_list);
@@ -1102,16 +1301,18 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(quota_root->node);
btrfs_clean_tree_block(quota_root->node);
btrfs_tree_unlock(quota_root->node);
- btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
+ btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
- free_extent_buffer(quota_root->node);
- free_extent_buffer(quota_root->commit_root);
- kfree(quota_root);
+ btrfs_put_root(quota_root);
-end_trans:
- ret = btrfs_end_transaction(trans);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+
return ret;
}
@@ -1247,13 +1448,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
+ unsigned int nofs_flag;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
@@ -1288,7 +1493,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = add_relation_rb(fs_info, src, dst);
+ ret = __add_relation_rb(member, parent);
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
@@ -1310,10 +1515,14 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup_list *list;
struct ulist *tmp;
bool found = false;
+ unsigned int nofs_flag;
int ret = 0;
int ret2;
+ /* We hold a transaction handle open, must do a NOFS allocation. */
+ nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
@@ -1402,8 +1611,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
qgroup = add_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
- if (IS_ERR(qgroup))
+ if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
+ goto out;
+ }
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1450,6 +1662,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * Remove the qgroup from sysfs now without holding the qgroup_lock
+ * spinlock, since the sysfs_remove_group() function needs to take
+ * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+ */
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -1522,7 +1742,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
ret = update_qgroup_limit_item(trans, qgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
@@ -1567,17 +1787,42 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct ulist *old_root;
u64 bytenr = qrecord->bytenr;
int ret;
- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
+ /*
+ * We are always called in a context where we are already holding a
+ * transaction handle. Often we are called when adding a data delayed
+ * reference from btrfs_truncate_inode_items() (truncating or unlinking),
+ * in which case we will be holding a write lock on extent buffer from a
+ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
+ * acquire fs_info->commit_root_sem, because that is a higher level lock
+ * that must be acquired before locking any extent buffers.
+ *
+ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
+ * but we can't pass it a non-NULL transaction handle, because otherwise
+ * it would not use commit roots and would lock extent buffers, causing
+ * a deadlock if it ends up trying to read lock the same extent buffer
+ * that was previously write locked at btrfs_truncate_inode_items().
+ *
+ * So pass a NULL transaction handle to btrfs_find_all_roots() and
+ * explicitly tell it to not acquire the commit_root_sem - if we are
+ * holding a transaction handle we don't need its protection.
+ */
+ ASSERT(trans != NULL);
+
+ if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ return 0;
+
+ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
+ true);
if (ret < 0) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_warn(fs_info,
+ qgroup_mark_inconsistent(trans->fs_info);
+ btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -1621,7 +1866,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
}
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -1814,34 +2059,22 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
struct btrfs_key dst_key;
if (src_path->nodes[cur_level] == NULL) {
- struct btrfs_key first_key;
struct extent_buffer *eb;
int parent_slot;
- u64 child_gen;
- u64 child_bytenr;
eb = src_path->nodes[cur_level + 1];
parent_slot = src_path->slots[cur_level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- cur_level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ src_path->locks[cur_level] = BTRFS_READ_LOCK;
}
src_path->slots[cur_level] = dst_path->slots[cur_level];
@@ -1936,10 +2169,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
/* Read the tree block if needed */
if (dst_path->nodes[cur_level] == NULL) {
- struct btrfs_key first_key;
int parent_slot;
u64 child_gen;
- u64 child_bytenr;
/*
* dst_path->nodes[root_level] must be initialized before
@@ -1958,31 +2189,23 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
*/
eb = dst_path->nodes[cur_level + 1];
parent_slot = dst_path->slots[cur_level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
/* This node is old, no need to trace */
if (child_gen < last_snapshot)
goto out;
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- cur_level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
dst_path->nodes[cur_level] = eb;
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK;
need_cleanup = true;
}
@@ -2074,7 +2297,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2085,6 +2308,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
int level;
+ u8 drop_subptree_thres;
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
@@ -2094,8 +2318,25 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
+ spin_lock(&fs_info->qgroup_lock);
+ drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * This function only gets called for snapshot drop, if we hit a high
+ * node here, it means we are going to change ownership for quite a lot
+ * of extents, which will greatly slow down btrfs_commit_transaction().
+ *
+ * So here if we find a high tree here, we just skip the accounting and
+ * mark qgroup inconsistent.
+ */
+ if (root_level >= drop_subptree_thres) {
+ qgroup_mark_inconsistent(fs_info);
+ return 0;
+ }
+
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
+ ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@@ -2126,38 +2367,28 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
- struct btrfs_key first_key;
int parent_slot;
- u64 child_gen;
u64 child_bytenr;
/*
- * We need to get child blockptr/gen from parent before
- * we can read it.
+ * We need to get child blockptr from parent before we
+ * can read it.
*/
eb = path->nodes[level + 1];
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
path->nodes[level] = eb;
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize,
@@ -2253,7 +2484,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
@@ -2416,10 +2647,11 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
int ret = 0;
/*
- * If quotas get disabled meanwhile, the resouces need to be freed and
+ * If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
if (new_roots) {
@@ -2515,7 +2747,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
- if (!ret) {
+ if (!ret && !(fs_info->qgroup_flags &
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
/*
* Old roots should be searched when inserting qgroup
* extent record
@@ -2535,12 +2768,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
/*
- * Use SEQ_LAST as time_seq to do special search, which
- * doesn't lock tree or delayed_refs and search current
- * root. It's safe inside commit_transaction().
+ * Use BTRFS_SEQ_LAST as time_seq to do special search,
+ * which doesn't lock tree or delayed_refs and search
+ * current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, SEQ_LAST, &new_roots, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -2588,12 +2821,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -2604,7 +2835,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2626,6 +2857,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ bool need_rescan = false;
u32 level_size = 0;
u64 nums;
@@ -2721,7 +2953,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
ret = update_qgroup_limit_item(trans, dstgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info,
"unable to update quota limit for %llu",
dstgroup->qgroupid);
@@ -2769,6 +3001,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto unlock;
}
++i_qgroups;
+
+ /*
+ * If we're doing a snapshot, and adding the snapshot to a new
+ * qgroup, the numbers are guaranteed to be incorrect.
+ */
+ if (srcid)
+ need_rescan = true;
}
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
@@ -2788,6 +3027,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
+
+ /* Manually tweaking numbers certainly needs a rescan */
+ need_rescan = true;
}
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
@@ -2806,30 +3048,23 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
+ need_rescan = true;
}
unlock:
spin_unlock(&fs_info->qgroup_lock);
+ if (!ret)
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (need_rescan)
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
-/*
- * Two limits to commit transaction in advance.
- *
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
- * For SIZE, it will be in byte unit as threshold.
- */
-#define QGROUP_FREE_RATIO 32
-#define QGROUP_FREE_SIZE SZ_32M
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 free;
- u64 threshold;
-
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2838,32 +3073,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
- /*
- * Even if we passed the check, it's better to check if reservation
- * for meta_pertrans is pushing us near limit.
- * If there is too much pertrans reservation or it's near the limit,
- * let's try commit transaction to free some, using transaction_kthread
- */
- if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
- BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
- free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
- threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- } else {
- free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
- threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- }
-
- /*
- * Use transaction_kthread to commit transaction, so we no
- * longer need to bother nested transaction nor lock context.
- */
- if (free < threshold)
- btrfs_commit_transaction_locksafe(fs_info);
- }
-
return true;
}
@@ -2911,7 +3120,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
@@ -3036,6 +3245,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *extent_root;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
@@ -3045,7 +3255,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int ret;
mutex_lock(&fs_info->qgroup_rescan_lock);
- ret = btrfs_search_slot_for_read(fs_info->extent_root,
+ extent_root = btrfs_extent_root(fs_info,
+ fs_info->qgroup_rescan_progress.objectid);
+ ret = btrfs_search_slot_for_read(extent_root,
&fs_info->qgroup_rescan_progress,
path, 1, 0);
@@ -3116,6 +3328,14 @@ out:
return ret;
}
+static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_fs_closing(fs_info) ||
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+}
+
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
@@ -3124,6 +3344,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
struct btrfs_trans_handle *trans = NULL;
int err = -ENOMEM;
int ret = 0;
+ bool stopped = false;
path = btrfs_alloc_path();
if (!path)
@@ -3136,17 +3357,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
path->skip_locking = 1;
err = 0;
- while (!err && !btrfs_fs_closing(fs_info)) {
+ while (!err && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
break;
}
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- err = -EINTR;
- } else {
- err = qgroup_rescan_leaf(trans, path);
- }
+
+ err = qgroup_rescan_leaf(trans, path);
+
if (err > 0)
btrfs_commit_transaction(trans);
else
@@ -3160,7 +3379,7 @@ out:
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0) {
+ } else if (err < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3179,7 +3398,8 @@ out:
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
+ if (!stopped ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
ret = update_qgroup_status_item(trans);
@@ -3190,6 +3410,7 @@ out:
}
}
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
complete_all(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3198,8 +3419,10 @@ out:
btrfs_end_transaction(trans);
- if (btrfs_fs_closing(fs_info)) {
+ if (stopped) {
btrfs_info(fs_info, "qgroup scan paused");
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
+ btrfs_info(fs_info, "qgroup scan cancelled");
} else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
@@ -3237,7 +3460,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- spin_lock(&fs_info->qgroup_lock);
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
@@ -3249,10 +3471,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
@@ -3261,11 +3485,10 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
- fs_info->qgroup_rescan_running = true;
-
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -3326,8 +3549,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
return 0;
}
@@ -3339,9 +3565,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
- spin_lock(&fs_info->qgroup_lock);
running = fs_info->qgroup_rescan_running;
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (!running)
@@ -3363,33 +3587,142 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ }
+}
+
+#define rbtree_iterate_from_safe(node, next, start) \
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len)
+{
+ struct rb_node *node;
+ struct rb_node *next;
+ struct ulist_node *entry;
+ int ret = 0;
+
+ node = reserved->range_changed.root.rb_node;
+ if (!node)
+ return 0;
+ while (node) {
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ if (entry->val < start)
+ node = node->rb_right;
+ else
+ node = node->rb_left;
+ }
+
+ if (entry->val > start && rb_prev(&entry->rb_node))
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+ rb_node);
+
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+ u64 entry_start;
+ u64 entry_end;
+ u64 entry_len;
+ int clear_ret;
+
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ entry_start = entry->val;
+ entry_end = entry->aux;
+ entry_len = entry_end - entry_start + 1;
+
+ if (entry_start >= start + len)
+ break;
+ if (entry_start + entry_len <= start)
+ continue;
+ /*
+ * Now the entry is in [start, start + len), revert the
+ * EXTENT_QGROUP_RESERVED bit.
+ */
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+ entry_end, EXTENT_QGROUP_RESERVED);
+ if (!ret && clear_ret < 0)
+ ret = clear_ret;
+
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
+ if (likely(reserved->bytes_changed >= entry_len)) {
+ reserved->bytes_changed -= entry_len;
+ } else {
+ WARN_ON(1);
+ reserved->bytes_changed = 0;
+ }
+ }
+
+ return ret;
}
/*
- * Reserve qgroup space for range [start, start + len).
+ * Try to free some space for qgroup.
*
- * This function will either reserve space from related qgroups or doing
- * nothing if the range is already reserved.
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
+ * In theory, we should only flush nodatacow inodes, but it's not yet
+ * possible, so we need to flush the whole root.
*
- * Return 0 for successful reserve
- * Return <0 for error (including -EQUOT)
+ * - Wait for ordered extents
+ * When ordered extents are finished, their reserved metadata is finally
+ * converted to per_trans status, which can be freed by later commit
+ * transaction.
*
- * NOTE: this function may sleep for memory allocation.
- * if btrfs_qgroup_reserve_data() is called multiple times with
- * same @reserved, caller must ensure when error happens it's OK
- * to free *ALL* reserved space.
+ * - Commit transaction
+ * This would free the meta_per_trans space.
+ * In theory this shouldn't provide much space, but any more qgroup space
+ * is needed.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode,
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /* Can't hold an open transaction or we run the risk of deadlocking. */
+ ASSERT(current->journal_info == NULL);
+ if (WARN_ON(current->journal_info))
+ return 0;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
+ ret = btrfs_start_delalloc_snapshot(root, true);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
+ return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
+ bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
@@ -3402,6 +3735,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
+ new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
@@ -3409,15 +3743,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
- trace_btrfs_qgroup_reserve_data(inode, start, len,
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
- goto cleanup;
+ goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -3425,23 +3759,48 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
return ret;
cleanup:
- /* cleanup *ALL* already reserved ranges */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&reserved->range_changed, &uiter)))
- clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
- /* Also free data bytes of already reserved one */
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
- orig_reserved, BTRFS_QGROUP_RSV_DATA);
- extent_changeset_release(reserved);
+ qgroup_unreserve_range(inode, reserved, start, len);
+out:
+ if (new_reserved) {
+ extent_changeset_free(reserved);
+ *reserved_ret = NULL;
+ }
return ret;
}
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ * commit transaction. So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ int ret;
+
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(inode->root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
/* Free ranges specified by @reserved, normally in error path */
-static int qgroup_free_reserved_data(struct inode *inode,
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
@@ -3477,8 +3836,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
- free_start, free_start + free_len - 1,
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -3492,7 +3851,7 @@ out:
return ret;
}
-static int __btrfs_qgroup_release_data(struct inode *inode,
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
int free)
{
@@ -3500,8 +3859,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
- &BTRFS_I(inode)->root->fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
return 0;
/* In release case, we shouldn't have @reserved */
@@ -3509,18 +3867,18 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
if (free)
trace_op = QGROUP_FREE;
- trace_btrfs_qgroup_release_data(inode, start, len,
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
changeset.bytes_changed, trace_op);
if (free)
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3540,7 +3898,7 @@ out:
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_free_data(struct inode *inode,
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
@@ -3561,7 +3919,7 @@ int btrfs_qgroup_free_data(struct inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
@@ -3606,8 +3964,8 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3633,6 +3991,22 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return ret;
}
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush)
+{
+ int ret;
+
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if ((ret <= 0 && ret != -EDQUOT) || noflush)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
+
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3732,7 +4106,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
* Check qgroup reserved space leaking, normally at destroy inode
* time
*/
-void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
struct extent_changeset changeset;
struct ulist_node *unode;
@@ -3740,19 +4114,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
ULIST_ITER_INIT(&iter);
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
- inode->i_ino, unode->val, unode->aux);
+ btrfs_warn(inode->root->fs_info,
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
+ btrfs_ino(inode), unode->val, unode->aux);
}
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
@@ -3910,8 +4284,7 @@ out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -3975,7 +4348,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
/* Read out reloc subtree root */
- reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
block->reloc_generation, block->level,
&block->first_key);
if (IS_ERR(reloc_eb)) {
@@ -3998,7 +4371,7 @@ out:
btrfs_err_rl(fs_info,
"failed to account subtree at bytenr %llu: %d",
subvol_eb->start, ret);
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1bc654459469..578c77e94200 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -99,6 +100,9 @@
* subtree rescan for them.
*/
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
@@ -223,8 +227,18 @@ struct btrfs_qgroup {
*/
u64 old_refcnt;
u64 new_refcnt;
+
+ /*
+ * Sysfs kobjectid
+ */
+ struct kobject kobj;
};
+static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
/*
* For qgroup event trace points only
*/
@@ -287,7 +301,7 @@ int btrfs_qgroup_trace_extent_nolock(
* using current root, then we can move all expensive backref walk out of
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
/*
@@ -344,26 +358,32 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode,
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
-int btrfs_qgroup_free_data(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
-
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len);
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce);
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ enforce, false);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
- int num_bytes, bool enforce)
+ int num_bytes, bool enforce,
+ bool noflush)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ enforce, noflush);
}
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
@@ -399,7 +419,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
*/
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
@@ -415,5 +435,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a8e53c8e7b01..82c8e991300e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -13,6 +13,7 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@@ -51,133 +52,21 @@ struct btrfs_stripe_hash_table {
struct btrfs_stripe_hash table[];
};
-enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE,
- BTRFS_RBIO_READ_REBUILD,
- BTRFS_RBIO_PARITY_SCRUB,
- BTRFS_RBIO_REBUILD_MISSING,
-};
-
-struct btrfs_raid_bio {
- struct btrfs_fs_info *fs_info;
- struct btrfs_bio *bbio;
-
- /* while we're doing rmw on a stripe
- * we put it into a hash table so we can
- * lock the stripe and merge more rbios
- * into it.
- */
- struct list_head hash_list;
-
- /*
- * LRU list for the stripe cache
- */
- struct list_head stripe_cache;
-
- /*
- * for scheduling work in the helper threads
- */
- struct btrfs_work work;
-
- /*
- * bio list and bio_list_lock are used
- * to add more bios into the stripe
- * in hopes of avoiding the full rmw
- */
- struct bio_list bio_list;
- spinlock_t bio_list_lock;
-
- /* also protected by the bio_list_lock, the
- * plug list is used by the plugging code
- * to collect partial bios while plugged. The
- * stripe locking code also uses it to hand off
- * the stripe lock to the next pending IO
- */
- struct list_head plug_list;
-
- /*
- * flags that tell us if it is safe to
- * merge with this bio
- */
- unsigned long flags;
-
- /* size of each individual stripe on disk */
- int stripe_len;
-
- /* number of data stripes (no p/q) */
- int nr_data;
-
- int real_stripes;
-
- int stripe_npages;
- /*
- * set if we're doing a parity rebuild
- * for a read from higher up, which is handled
- * differently from a parity rebuild as part of
- * rmw
- */
- enum btrfs_rbio_ops operation;
-
- /* first bad stripe */
- int faila;
-
- /* second bad stripe (for raid6 use) */
- int failb;
-
- int scrubp;
- /*
- * number of pages needed to represent the full
- * stripe
- */
- int nr_pages;
-
- /*
- * size of all the bios in the bio_list. This
- * helps us decide if the rbio maps to a full
- * stripe or not
- */
- int bio_list_bytes;
-
- int generic_bio_cnt;
-
- refcount_t refs;
-
- atomic_t stripes_pending;
-
- atomic_t error;
- /*
- * these are two arrays of pointers. We allocate the
- * rbio big enough to hold them both and setup their
- * locations when the rbio is allocated
- */
-
- /* pointers to pages that we allocated for
- * reading/writing stripes directly from the disk (including P/Q)
- */
- struct page **stripe_pages;
-
- /*
- * pointers to the pages in the bio_list. Stored
- * here for faster lookup
- */
- struct page **bio_pages;
-
- /*
- * bitmap to record which horizontal stripe has data
- */
- unsigned long *dbitmap;
-
- /* allocated with real_stripes-many pointers for finish_*() calls */
- void **finish_pointers;
-
- /* allocated with stripe_npages-many bits for finish_*() calls */
- unsigned long *finish_pbitmap;
+/*
+ * A bvec like structure to present a sector inside a page.
+ *
+ * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
+ */
+struct sector_ptr {
+ struct page *page;
+ unsigned int pgoff:24;
+ unsigned int uptodate:8;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct btrfs_work *work);
-static void read_rebuild_work(struct btrfs_work *work);
+static void rmw_work(struct work_struct *work);
+static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
@@ -186,12 +75,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check);
-static void scrub_parity_work(struct btrfs_work *work);
+static void scrub_parity_work(struct work_struct *work);
-static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
+static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
- btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+ INIT_WORK(&rbio->work, work_func);
+ queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -206,7 +95,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
- int table_size;
if (info->stripe_hash_table)
return 0;
@@ -218,8 +106,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
* Try harder to allocate and fallback to vmalloc to lower the chance
* of a failing mount.
*/
- table_size = sizeof(*table) + sizeof(*h) * num_entries;
- table = kvzalloc(table_size, GFP_KERNEL);
+ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -235,14 +122,13 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
- if (x)
- kvfree(x);
+ kvfree(x);
return 0;
}
/*
* caching an rbio means to copy anything from the
- * bio_pages array into the stripe_pages array. We
+ * bio_sectors array into the stripe_pages array. We
* use the page uptodate bit in the stripe cache array
* to indicate if it has valid data
*
@@ -252,26 +138,24 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
int i;
- char *s;
- char *d;
int ret;
ret = alloc_rbio_pages(rbio);
if (ret)
return;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (!rbio->bio_pages[i])
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ /* Some range not covered by bio (partial write), skip it */
+ if (!rbio->bio_sectors[i].page)
continue;
- s = kmap(rbio->bio_pages[i]);
- d = kmap(rbio->stripe_pages[i]);
-
- copy_page(d, s);
-
- kunmap(rbio->bio_pages[i]);
- kunmap(rbio->stripe_pages[i]);
- SetPageUptodate(rbio->stripe_pages[i]);
+ ASSERT(rbio->stripe_sectors[i].page);
+ memcpy_page(rbio->stripe_sectors[i].page,
+ rbio->stripe_sectors[i].pgoff,
+ rbio->bio_sectors[i].page,
+ rbio->bio_sectors[i].pgoff,
+ rbio->bioc->fs_info->sectorsize);
+ rbio->stripe_sectors[i].uptodate = 1;
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
@@ -281,7 +165,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bbio->raid_map[0];
+ u64 num = rbio->bioc->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -294,32 +178,86 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
+static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page;
+ i++) {
+ if (!rbio->stripe_sectors[i].uptodate)
+ return false;
+ }
+ return true;
+}
+
/*
- * stealing an rbio means taking all the uptodate pages from the stripe
- * array in the source rbio and putting them into the destination rbio
+ * Update the stripe_sectors[] array to use correct page and pgoff
+ *
+ * Should be called every time any page pointer in stripes_pages[] got modified.
+ */
+static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ u32 offset;
+ int i;
+
+ for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ int page_index = offset >> PAGE_SHIFT;
+
+ ASSERT(page_index < rbio->nr_pages);
+ rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
+ rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ }
+}
+
+static void steal_rbio_page(struct btrfs_raid_bio *src,
+ struct btrfs_raid_bio *dest, int page_nr)
+{
+ const u32 sectorsize = src->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ if (dest->stripe_pages[page_nr])
+ __free_page(dest->stripe_pages[page_nr]);
+ dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
+ src->stripe_pages[page_nr] = NULL;
+
+ /* Also update the sector->uptodate bits. */
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page; i++)
+ dest->stripe_sectors[i].uptodate = true;
+}
+
+/*
+ * Stealing an rbio means taking all the uptodate pages from the stripe array
+ * in the source rbio and putting them into the destination rbio.
+ *
+ * This will also update the involved stripe_sectors[] which are referring to
+ * the old pages.
*/
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
struct page *s;
- struct page *d;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
for (i = 0; i < dest->nr_pages; i++) {
s = src->stripe_pages[i];
- if (!s || !PageUptodate(s)) {
+ if (!s || !full_page_sectors_uptodate(src, i))
continue;
- }
- d = dest->stripe_pages[i];
- if (d)
- __free_page(d);
-
- dest->stripe_pages[i] = s;
- src->stripe_pages[i] = NULL;
+ steal_rbio_page(src, dest, i);
}
+ index_stripe_sectors(dest);
+ index_stripe_sectors(src);
}
/*
@@ -334,7 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
- dest->generic_bio_cnt += victim->generic_bio_cnt;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
+ dest->stripe_nsectors);
bio_list_init(&victim->bio_list);
}
@@ -355,7 +295,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
@@ -410,7 +350,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
@@ -470,7 +410,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
@@ -533,9 +473,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio)
int ret = 1;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
- if (size != rbio->nr_data * rbio->stripe_len)
+ if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
ret = 0;
- BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
return ret;
@@ -569,8 +509,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bbio->raid_map[0] !=
- cur->bbio->raid_map[0])
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -612,39 +551,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
-static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return stripe * rbio->stripe_npages + index;
+ ASSERT(stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
+
+ return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
-/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+/* Return a sector from rbio->stripe_sectors, not from the bio list */
+static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+ return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
+ sector_nr)];
}
-/*
- * helper to index into the pstripe
- */
-static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside P stripe */
+static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
- return rbio_stripe_page(rbio, rbio->nr_data, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}
-/*
- * helper to index into the qstripe, returns null
- * if there is no qstripe
- */
-static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside Q stripe, return NULL if not RAID6 */
+static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
- return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
/*
@@ -679,11 +618,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
- h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
@@ -761,7 +700,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
int keep_cache = 0;
bucket = rbio_bucket(rbio);
- h = rbio->fs_info->stripe_hash_table->table + bucket;
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
@@ -848,7 +787,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- btrfs_put_bbio(rbio->bbio);
+ btrfs_put_bioc(rbio->bioc);
kfree(rbio);
}
@@ -874,8 +813,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
- if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -916,54 +859,50 @@ static void raid_write_end_io(struct bio *bio)
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bbio->max_errors;
+ 0 : rbio->bioc->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
rbio_orig_end_io(rbio, err);
}
-/*
- * the read/modify/write code wants to use the original bio for
- * any pages it included, and then use the rbio for everything
- * else. This function decides if a given index (stripe number)
- * and page number in that stripe fall inside the original bio
- * or the rbio.
+/**
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr
*
- * if you set bio_list_only, you'll get a NULL back for any ranges
- * that are outside the bio_list
+ * @rbio: The raid bio
+ * @stripe_nr: Stripe number, valid range [0, real_stripe)
+ * @sector_nr: Sector number inside the stripe,
+ * valid range [0, stripe_nsectors)
+ * @bio_list_only: Whether to use sectors inside the bio list only.
*
- * This doesn't take any refs on anything, you get a bare page pointer
- * and the caller must bump refs as required.
- *
- * You must call index_rbio_pages once before you can trust
- * the answers from this function.
+ * The read/modify/write code wants to reuse the original bio page as much
+ * as possible, and only use stripe_sectors as fallback.
*/
-static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
- int index, int pagenr, int bio_list_only)
+static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
{
- int chunk_page;
- struct page *p = NULL;
+ struct sector_ptr *sector;
+ int index;
- chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+
+ index = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(index >= 0 && index < rbio->nr_sectors);
spin_lock_irq(&rbio->bio_list_lock);
- p = rbio->bio_pages[chunk_page];
+ sector = &rbio->bio_sectors[index];
+ if (sector->page || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (!sector->page)
+ sector = NULL;
+ spin_unlock_irq(&rbio->bio_list_lock);
+ return sector;
+ }
spin_unlock_irq(&rbio->bio_list_lock);
- if (p || bio_list_only)
- return p;
-
- return rbio->stripe_pages[chunk_page];
-}
-
-/*
- * number of pages we need for the entire stripe across all the
- * drives
- */
-static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
-{
- return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
+ return &rbio->stripe_sectors[index];
}
/*
@@ -971,23 +910,30 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_bio *bbio,
- u64 stripe_len)
-{
+ struct btrfs_io_context *bioc)
+{
+ const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
+ const unsigned int num_pages = stripe_npages * real_stripes;
+ const unsigned int stripe_nsectors =
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- int nr_data = 0;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
- int num_pages = rbio_nr_pages(stripe_len, real_stripes);
- int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
+ /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * Our current stripe len should be fixed to 64k thus stripe_nsectors
+ * (at most 16) should be no larger than BITS_PER_LONG.
+ */
+ ASSERT(stripe_nsectors <= BITS_PER_LONG);
+
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_pages) * num_pages +
- sizeof(*rbio->finish_pointers) * real_stripes +
- sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
- sizeof(*rbio->finish_pbitmap) *
- BITS_TO_LONGS(stripe_npages),
+ sizeof(*rbio->bio_sectors) * num_sectors +
+ sizeof(*rbio->stripe_sectors) * num_sectors +
+ sizeof(*rbio->finish_pointers) * real_stripes,
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -997,12 +943,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
- rbio->bbio = bbio;
- rbio->fs_info = fs_info;
- rbio->stripe_len = stripe_len;
+ btrfs_get_bioc(bioc);
+ rbio->bioc = bioc;
rbio->nr_pages = num_pages;
+ rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
+ rbio->stripe_nsectors = stripe_nsectors;
rbio->faila = -1;
rbio->failb = -1;
refcount_set(&rbio->refs, 1);
@@ -1010,8 +957,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
atomic_set(&rbio->stripes_pending, 0);
/*
- * the stripe_pages, bio_pages, etc arrays point to the extra
- * memory we allocated past the end of the rbio
+ * The stripe_pages, bio_sectors, etc arrays point to the extra memory
+ * we allocated past the end of the rbio.
*/
p = rbio + 1;
#define CONSUME_ALLOC(ptr, count) do { \
@@ -1019,80 +966,76 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
} while (0)
CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
+ CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
- CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
- CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
- nr_data = real_stripes - 1;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
- nr_data = real_stripes - 2;
- else
- BUG();
+ ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
+ rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
- rbio->nr_data = nr_data;
return rbio;
}
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ int ret;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ if (ret < 0)
+ return ret;
+ /* Mapping all sectors */
+ index_stripe_sectors(rbio);
return 0;
}
/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
+ int ret;
- i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
+ ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
+ rbio->stripe_pages + data_pages);
+ if (ret < 0)
+ return ret;
- for (; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * add a single page from a specific stripe into our list of bios for IO
- * this will try to merge into existing bios if possible, and returns
- * zero if all went well.
+ * Add a single sector @sector into our list of bios for IO.
+ *
+ * Return 0 if everything went well.
+ * Return <0 for error.
*/
-static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct page *page,
- int stripe_nr,
- unsigned long page_index,
- unsigned long bio_max_len)
-{
+static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct sector_ptr *sector,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ enum req_op op)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio *last = bio_list->tail;
- u64 last_end = 0;
int ret;
struct bio *bio;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
u64 disk_start;
- stripe = &rbio->bbio->stripes[stripe_nr];
- disk_start = stripe->physical + (page_index << PAGE_SHIFT);
+ /*
+ * Note: here stripe_nr has taken device replace into consideration,
+ * thus it can be larger than rbio->real_stripe.
+ * So here we check against bioc->num_stripes, not rbio->real_stripes.
+ */
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT(sector->page);
+
+ stripe = &rbio->bioc->stripes[stripe_nr];
+ disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
@@ -1100,30 +1043,30 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
- if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_status &&
- last->bi_disk == stripe->dev->bdev->bd_disk &&
- last->bi_partno == stripe->dev->bdev->bd_partno) {
- ret = bio_add_page(last, page, PAGE_SIZE, 0);
- if (ret == PAGE_SIZE)
+ if (last_end == disk_start && !last->bi_status &&
+ last->bi_bdev == stripe->dev->bdev) {
+ ret = bio_add_page(last, sector->page, sectorsize,
+ sector->pgoff);
+ if (ret == sectorsize)
return 0;
}
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- bio->bi_iter.bi_size = 0;
- bio_set_dev(bio, stripe->dev->bdev);
+ bio = bio_alloc(stripe->dev->bdev,
+ max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
+ op, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> 9;
+ bio->bi_private = rbio;
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1145,6 +1088,29 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
}
}
+static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+
+ bio_for_each_segment(bvec, bio, iter) {
+ u32 bvec_offset;
+
+ for (bvec_offset = 0; bvec_offset < bvec.bv_len;
+ bvec_offset += sectorsize, offset += sectorsize) {
+ int index = offset / sectorsize;
+ struct sector_ptr *sector = &rbio->bio_sectors[index];
+
+ sector->page = bvec.bv_page;
+ sector->pgoff = bvec.bv_offset + bvec_offset;
+ ASSERT(sector->pgoff < PAGE_SIZE);
+ }
+ }
+}
+
/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
@@ -1156,29 +1122,40 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
- u64 start;
- unsigned long stripe_offset;
- unsigned long page_index;
spin_lock_irq(&rbio->bio_list_lock);
- bio_list_for_each(bio, &rbio->bio_list) {
- struct bio_vec bvec;
- struct bvec_iter iter;
- int i = 0;
+ bio_list_for_each(bio, &rbio->bio_list)
+ index_one_bio(rbio, bio);
+
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
+ struct raid56_bio_trace_info *trace_info)
+{
+ const struct btrfs_io_context *bioc = rbio->bioc;
+ int i;
- start = (u64)bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bbio->raid_map[0];
- page_index = stripe_offset >> PAGE_SHIFT;
+ ASSERT(bioc);
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ /* We rely on bio->bi_bdev to find the stripe number. */
+ if (!bio->bi_bdev)
+ goto not_found;
- bio_for_each_segment(bvec, bio, iter) {
- rbio->bio_pages[page_index + i] = bvec.bv_page;
- i++;
- }
+ for (i = 0; i < bioc->num_stripes; i++) {
+ if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
+ continue;
+ trace_info->stripe_nr = i;
+ trace_info->devid = bioc->stripes[i].dev->devid;
+ trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ bioc->stripes[i].physical;
+ return;
}
- spin_unlock_irq(&rbio->bio_list_lock);
+
+not_found:
+ trace_info->devid = -1;
+ trace_info->offset = -1;
+ trace_info->stripe_nr = -1;
}
/*
@@ -1191,27 +1168,31 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
+ /* The total sector number inside the full stripe. */
+ int total_sector_nr;
int stripe;
- int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
+ /* Sector number inside a stripe. */
+ int sectornr;
+ bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
int ret;
bio_list_init(&bio_list);
- if (rbio->real_stripes - rbio->nr_data == 1) {
- p_stripe = rbio->real_stripes - 1;
- } else if (rbio->real_stripes - rbio->nr_data == 2) {
- p_stripe = rbio->real_stripes - 2;
- q_stripe = rbio->real_stripes - 1;
- } else {
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
BUG();
- }
+
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
@@ -1242,103 +1223,123 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
- /* first collect one page from each data stripe */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+ /* First collect one sector from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
- /* then add the parity stripe */
- p = rbio_pstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
-
- if (q_stripe != -1) {
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+ if (has_qstripe) {
/*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
*/
- p = rbio_qstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
-
-
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
}
/*
- * time to start writing. Make bios for everything from the
- * higher layers (the bio_list in our rbio) and our p/q. Ignore
- * everything else.
+ * Start writing. Make bios for everything from the higher layers (the
+ * bio_list in our rbio) and our P/Q. Ignore everything else.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
- if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
- continue;
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list,
- page, stripe, pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
- if (likely(!bbio->num_tgtdevs))
+ if (likely(!bioc->num_tgtdevs))
goto write_data;
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bbio->tgtdev_map[stripe])
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ if (!bioc->tgtdev_map[stripe]) {
+ /*
+ * We can skip the whole stripe completely, note
+ * total_sector_nr will be increased by one anyway.
+ */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
continue;
+ }
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
- if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
- continue;
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bbio->tgtdev_map[stripe],
- pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ rbio->bioc->tgtdev_map[stripe],
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
write_data:
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
- bio->bi_private = rbio;
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -1359,20 +1360,15 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
- u64 stripe_start;
int i;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
physical <<= 9;
- for (i = 0; i < rbio->bbio->num_stripes; i++) {
- stripe = &rbio->bbio->stripes[i];
- stripe_start = stripe->physical;
- if (physical >= stripe_start &&
- physical < stripe_start + rbio->stripe_len &&
- stripe->dev->bdev &&
- bio->bi_disk == stripe->dev->bdev->bd_disk &&
- bio->bi_partno == stripe->dev->bdev->bd_partno) {
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
+ if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
+ stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
}
@@ -1387,18 +1383,14 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = bio->bi_iter.bi_sector;
- u64 stripe_start;
+ u64 logical = bio->bi_iter.bi_sector << 9;
int i;
- logical <<= 9;
-
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->bbio->raid_map[i];
- if (logical >= stripe_start &&
- logical < stripe_start + rbio->stripe_len) {
+ u64 stripe_start = rbio->bioc->raid_map[i];
+
+ if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
return i;
- }
}
return -1;
}
@@ -1450,56 +1442,89 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
}
/*
+ * For subpage case, we can no longer set page Uptodate directly for
+ * stripe_pages[], thus we need to locate the sector.
+ */
+static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
+ struct page *page,
+ unsigned int pgoff)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
+ if (sector->page == page && sector->pgoff == pgoff)
+ return sector;
+ }
+ return NULL;
+}
+
+/*
* this sets each page in the bio uptodate. It should only be used on private
* rbio pages, nothing that comes in from the higher layers
*/
-static void set_bio_pages_uptodate(struct bio *bio)
+static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- SetPageUptodate(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct sector_ptr *sector;
+ int pgoff;
+
+ for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
+ pgoff += sectorsize) {
+ sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
+ }
+ }
}
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+ &rbio->end_io_work);
+}
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
- goto cleanup;
+/*
+ * End io handler for the read phase of the RMW cycle. All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ return;
+ }
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write but if there
+ * are any failed stripes we'll reconstruct from parity first.
*/
validate_rbio_for_rmw(rbio);
- return;
-
-cleanup:
-
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
@@ -1510,9 +1535,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
+ const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
int ret;
- int pagenr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -1524,36 +1549,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
- /*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
- */
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
- continue;
+ /* Build a list of bios to read all the missing data sectors. */
+ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
- page = rbio_stripe_page(rbio, stripe, pagenr);
- /*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
- */
- if (PageUptodate(page))
- continue;
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a page
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate page. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -1568,21 +1591,20 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
+ INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid56_bio_end_io;
- bio->bi_private = rbio;
- bio->bi_end_io = raid_rmw_end_io;
- bio->bi_opf = REQ_OP_READ;
-
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_read_partial_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_read_partial(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -1610,10 +1632,8 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio)
int ret;
ret = alloc_rbio_parity_pages(rbio);
- if (ret) {
- __free_raid_bio(rbio);
+ if (ret)
return ret;
- }
ret = lock_stripe_add(rbio);
if (ret == 0)
@@ -1661,18 +1681,19 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct btrfs_work work;
+ struct work_struct work;
};
/*
* rbios on the plug list are sorted for easier merging.
*/
-static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
- struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
- plug_list);
- struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
- plug_list);
+ const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
@@ -1728,7 +1749,7 @@ static void run_plug(struct btrfs_plug_cb *plug)
* if the unplug comes from schedule, we have to push the
* work off to a helper thread
*/
-static void unplug_work(struct btrfs_work *work)
+static void unplug_work(struct work_struct *work)
{
struct btrfs_plug_cb *plug;
plug = container_of(work, struct btrfs_plug_cb, work);
@@ -1741,36 +1762,58 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
- btrfs_queue_work(plug->info->rmw_workers,
- &plug->work);
+ INIT_WORK(&plug->work, unplug_work);
+ queue_work(plug->info->rmw_workers, &plug->work);
return;
}
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bioc->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ fs_info->sectorsize_bits) % rbio->stripe_nsectors;
+
+ set_bit(bit, &rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len)
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
- int ret;
+ int ret = 0;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- btrfs_put_bbio(bbio);
- return PTR_ERR(rbio);
+ ret = PTR_ERR(rbio);
+ goto fail;
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
-
- btrfs_bio_counter_inc_noblocked(fs_info);
- rbio->generic_bio_cnt = 1;
+ rbio_add_bio(rbio, bio);
/*
* don't plug on full rbios, just get them out the door
@@ -1778,9 +1821,11 @@ int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
*/
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
- if (ret)
- btrfs_bio_counter_dec(fs_info);
- return ret;
+ if (ret) {
+ __free_raid_bio(rbio);
+ goto fail;
+ }
+ return;
}
cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
@@ -1791,13 +1836,19 @@ int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- ret = 0;
} else {
ret = __raid56_parity_write(rbio);
- if (ret)
- btrfs_bio_counter_dec(fs_info);
+ if (ret) {
+ __free_raid_bio(rbio);
+ goto fail;
+ }
}
- return ret;
+
+ return;
+
+fail:
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
}
/*
@@ -1807,19 +1858,34 @@ int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
*/
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
- int pagenr, stripe;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int sectornr, stripe;
void **pointers;
+ void **unmap_array;
int faila = -1, failb = -1;
- struct page *page;
blk_status_t err;
int i;
+ /*
+ * This array stores the pointer for each sector, thus it has the extra
+ * pgoff value added from each sector
+ */
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = BLK_STS_RESOURCE;
goto cleanup_io;
}
+ /*
+ * Store copy of pointers that does not get reordered during
+ * reconstruction so that kunmap_local works.
+ */
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!unmap_array) {
+ err = BLK_STS_RESOURCE;
+ goto cleanup_pointers;
+ }
+
faila = rbio->faila;
failb = rbio->failb;
@@ -1832,39 +1898,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(pagenr, rbio->dbitmap))
+ !test_bit(sectornr, &rbio->dbitmap))
continue;
- /* setup our array of pointers with pages
- * from each stripe
+ /*
+ * Setup our array of pointers with sectors from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
- * if we're rebuilding a read, we have to use
+ * If we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- pointers[stripe] = kmap(page);
+ ASSERT(sector->page);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
+ unmap_array[stripe] = pointers[stripe];
}
- /* all raid6 handling here */
- if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /*
- * single failure, rebuild from parity raid5
- * style
- */
+ /* All raid6 handling here */
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ /* Single failure, rebuild from parity raid5 style */
if (failb < 0) {
if (faila == rbio->nr_data) {
/*
@@ -1883,11 +1954,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* make sure our ps and qs are in order */
- if (faila > failb) {
- int tmp = failb;
- failb = faila;
- faila = tmp;
- }
+ if (faila > failb)
+ swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
@@ -1895,8 +1963,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bbio->raid_map[faila] ==
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
@@ -1908,12 +1976,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
- PAGE_SIZE, faila, pointers);
+ sectorsize, faila, pointers);
} else {
raid6_2data_recov(rbio->real_stripes,
- PAGE_SIZE, faila, failb,
+ sectorsize, faila, failb,
pointers);
}
} else {
@@ -1923,7 +1991,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
BUG_ON(failb != -1);
pstripe:
/* Copy parity block into failed block to start with */
- copy_page(pointers[faila], pointers[rbio->nr_data]);
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
/* rearrange the pointer array */
p = pointers[faila];
@@ -1932,7 +2000,7 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
@@ -1941,35 +2009,25 @@ pstripe:
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_npages; i++) {
+ for (i = 0; i < rbio->stripe_nsectors; i++) {
if (faila != -1) {
- page = rbio_stripe_page(rbio, faila, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, faila, i);
+ sector->uptodate = 1;
}
if (failb != -1) {
- page = rbio_stripe_page(rbio, failb, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, failb, i);
+ sector->uptodate = 1;
}
}
}
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- /*
- * if we're rebuilding a read, we have to use
- * pages from the bio list
- */
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
- kunmap(page);
- }
+ for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
+ kunmap_local(unmap_array[stripe]);
}
err = BLK_STS_OK;
cleanup:
+ kfree(unmap_array);
+cleanup_pointers:
kfree(pointers);
cleanup_io:
@@ -2017,27 +2075,15 @@ cleanup_io:
}
/*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
*/
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
- /*
- * we only read stripe pages off the disk, set them
- * up to date if there were no errors
- */
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(bio);
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
-
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
@@ -2056,8 +2102,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2069,33 +2114,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
if (rbio->faila == stripe || rbio->failb == stripe) {
atomic_inc(&rbio->error);
+ /* Skip the current stripe. */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
continue;
}
-
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
- ret = rbio_add_io_page(rbio, &bio_list,
- rbio_stripe_page(rbio, stripe, pagenr),
- stripe, pagenr, rbio->stripe_len);
- if (ret < 0)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret < 0)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2105,33 +2148,32 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
__raid_recover_end_io(rbio);
- goto out;
+ return 0;
} else {
goto cleanup;
}
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
- bio->bi_private = rbio;
- bio->bi_end_io = raid_recover_end_io;
- bio->bi_opf = REQ_OP_READ;
+ INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_recover_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
-out:
+
return 0;
cleanup:
@@ -2151,46 +2193,30 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io)
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- int ret;
- if (generic_io) {
- ASSERT(bbio->mirror_num == mirror_num);
- btrfs_io_bio(bio)->mirror_num = mirror_num;
- }
-
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- if (generic_io)
- btrfs_put_bbio(bbio);
- return PTR_ERR(rbio);
+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
+ goto out_end_bio;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
- "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
- __func__, (u64)bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bbio->map_type);
- if (generic_io)
- btrfs_put_bbio(bbio);
- kfree(rbio);
- return -EIO;
- }
-
- if (generic_io) {
- btrfs_bio_counter_inc_noblocked(fs_info);
- rbio->generic_bio_cnt = 1;
- } else {
- btrfs_get_bbio(bbio);
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
+ __func__, bio->bi_iter.bi_sector << 9,
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
+ __free_raid_bio(rbio);
+ bio->bi_status = BLK_STS_IOERR;
+ goto out_end_bio;
}
/*
@@ -2210,27 +2236,21 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->failb--;
}
- ret = lock_stripe_add(rbio);
+ if (lock_stripe_add(rbio))
+ return;
/*
- * __raid56_parity_recover will end the bio with
- * any errors it hits. We don't want to return
- * its error value up the stack because our caller
- * will end up calling bio_endio with any nonzero
- * return
+ * This adds our rbio to the list of rbios that will be handled after
+ * the current lock owner is done.
*/
- if (ret == 0)
- __raid56_parity_recover(rbio);
- /*
- * our rbio has been added to the list of
- * rbios that will be handled after the
- * currently lock owner is done
- */
- return 0;
+ __raid56_parity_recover(rbio);
+ return;
+out_end_bio:
+ bio_endio(bio);
}
-static void rmw_work(struct btrfs_work *work)
+static void rmw_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2238,7 +2258,7 @@ static void rmw_work(struct btrfs_work *work)
raid56_rmw_stripe(rbio);
}
-static void read_rebuild_work(struct btrfs_work *work)
+static void read_rebuild_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2249,23 +2269,23 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
*
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
*/
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2277,45 +2297,37 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
- * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
- if (bbio->stripes[i].dev == scrub_dev) {
+ if (bioc->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
}
ASSERT(i < rbio->real_stripes);
- /* Now we just support the sectorsize equals to page size */
- ASSERT(fs_info->sectorsize == PAGE_SIZE);
- ASSERT(rbio->stripe_npages == stripe_nsectors);
- bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
-
- /*
- * We have already increased bio_counter when getting bbio, record it
- * so we can free it at rbio_orig_end_io().
- */
- rbio->generic_bio_cnt = 1;
-
+ bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
return rbio;
}
/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical)
+ unsigned int pgoff, u64 logical)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
int stripe_offset;
int index;
- ASSERT(logical >= rbio->bbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
- rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
- index = stripe_offset >> PAGE_SHIFT;
- rbio->bio_pages[index] = page;
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
+ BTRFS_STRIPE_LEN * rbio->nr_data);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
+ index = stripe_offset / sectorsize;
+ rbio->bio_sectors[index].page = page;
+ rbio->bio_sectors[index].pgoff = pgoff;
}
/*
@@ -2324,39 +2336,41 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- int bit;
- int index;
- struct page *page;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int total_sector_nr;
- for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
- for (i = 0; i < rbio->real_stripes; i++) {
- index = i * rbio->stripe_npages + bit;
- if (rbio->stripe_pages[index])
- continue;
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct page *page;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[index] = page;
- }
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+ if (rbio->stripe_pages[index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
}
+ index_stripe_sectors(rbio);
return 0;
}
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- unsigned long *pbitmap = rbio->finish_pbitmap;
+ unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
- int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
- struct page *p_page = NULL;
- struct page *q_page = NULL;
+ int sectornr;
+ bool has_qstripe;
+ struct sector_ptr p_sector = { 0 };
+ struct sector_ptr q_sector = { 0 };
struct bio_list bio_list;
struct bio *bio;
int is_replace = 0;
@@ -2364,18 +2378,16 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bio_list_init(&bio_list);
- if (rbio->real_stripes - rbio->nr_data == 1) {
- p_stripe = rbio->real_stripes - 1;
- } else if (rbio->real_stripes - rbio->nr_data == 2) {
- p_stripe = rbio->real_stripes - 2;
- q_stripe = rbio->real_stripes - 1;
- } else {
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
BUG();
- }
- if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
- bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+ bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
/*
@@ -2388,68 +2400,73 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!need_check)
goto writeback;
- p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!p_page)
+ p_sector.page = alloc_page(GFP_NOFS);
+ if (!p_sector.page)
goto cleanup;
- SetPageUptodate(p_page);
-
- if (q_stripe != -1) {
- q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!q_page) {
- __free_page(p_page);
+ p_sector.pgoff = 0;
+ p_sector.uptodate = 1;
+
+ if (has_qstripe) {
+ /* RAID6, allocate and map temp space for the Q stripe */
+ q_sector.page = alloc_page(GFP_NOFS);
+ if (!q_sector.page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
goto cleanup;
}
- SetPageUptodate(q_page);
+ q_sector.pgoff = 0;
+ q_sector.uptodate = 1;
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
atomic_set(&rbio->error, 0);
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *p;
+ /* Map the parity stripe just once */
+ pointers[nr_data] = kmap_local_page(p_sector.page);
+
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
void *parity;
+
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
- /* then add the parity stripe */
- pointers[stripe++] = kmap(p_page);
-
- if (q_stripe != -1) {
-
- /*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
- */
- pointers[stripe++] = kmap(q_page);
-
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ if (has_qstripe) {
+ /* RAID6, call the library function to fill in our P/Q */
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
/* Check scrubbing parity and repair it */
- p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- parity = kmap(p);
- if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
- copy_page(parity, pointers[rbio->scrubp]);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ parity = kmap_local_page(sector->page) + sector->pgoff;
+ if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
+ memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
/* Parity is right, needn't writeback */
- bitmap_clear(rbio->dbitmap, pagenr, 1);
- kunmap(p);
+ bitmap_clear(&rbio->dbitmap, sectornr, 1);
+ kunmap_local(parity);
- for (stripe = 0; stripe < nr_data; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
- kunmap(p_page);
+ for (stripe = nr_data - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
}
- __free_page(p_page);
- if (q_page)
- __free_page(q_page);
+ kunmap_local(pointers[nr_data]);
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
+ if (q_sector.page) {
+ kunmap_local(pointers[rbio->real_stripes - 1]);
+ __free_page(q_sector.page);
+ q_sector.page = NULL;
+ }
writeback:
/*
@@ -2457,12 +2474,12 @@ writeback:
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list,
- page, rbio->scrubp, pagenr, rbio->stripe_len);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2470,13 +2487,13 @@ writeback:
if (!is_replace)
goto submit_write;
- for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list, page,
- bbio->tgtdev_map[rbio->scrubp],
- pagenr, rbio->stripe_len);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ bioc->tgtdev_map[rbio->scrubp],
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2491,15 +2508,15 @@ submit_write:
atomic_set(&rbio->stripes_pending, nr_data);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
- bio->bi_private = rbio;
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
+ if (trace_raid56_scrub_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -2527,7 +2544,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
@@ -2548,7 +2565,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
- if (dfail > rbio->bbio->max_errors - 1)
+ if (dfail > rbio->bioc->max_errors - 1)
goto cleanup;
/*
@@ -2587,24 +2604,14 @@ cleanup:
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write, but if there
+ * are any failed stripes we'll reconstruct from parity first
*/
validate_rbio_for_parity_scrub(rbio);
}
@@ -2614,8 +2621,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2625,36 +2631,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
goto cleanup;
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
- /*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
- */
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
- continue;
+ /* Build a list of bios to read all the missing parts. */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
+ /* No data in the vertical stripe, no need to read. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- page = rbio_stripe_page(rbio, stripe, pagenr);
- /*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
- */
- if (PageUptodate(page))
- continue;
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a sector
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate sector. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2669,21 +2677,20 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
- bio->bi_private = rbio;
- bio->bi_end_io = raid56_parity_scrub_end_io;
- bio->bi_opf = REQ_OP_READ;
+ INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
+ while ((bio = bio_list_pop(&bio_list))) {
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -2701,7 +2708,7 @@ finish:
validate_rbio_for_parity_scrub(rbio);
}
-static void scrub_parity_work(struct btrfs_work *work)
+static void scrub_parity_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2718,12 +2725,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bbio, length);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
@@ -2737,17 +2744,13 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
- BUG();
- kfree(rbio);
+ btrfs_warn_rl(fs_info,
+ "can not determine the failed stripe number for full stripe %llu",
+ bioc->raid_map[0]);
+ __free_raid_bio(rbio);
return NULL;
}
- /*
- * When we get bbio, we have already increased bio_counter, record it
- * so we can free it at rbio_orig_end_io()
- */
- rbio->generic_bio_cnt = 1;
-
return rbio;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2503485db859..91d5c0adad15 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,48 +7,177 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
-static inline int nr_parity_stripes(const struct map_lookup *map)
-{
- if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- return 1;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
- else
- return 0;
-}
+#include <linux/workqueue.h>
+#include "volumes.h"
+
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
+};
+
+struct btrfs_raid_bio {
+ struct btrfs_io_context *bioc;
+
+ /*
+ * While we're doing RMW on a stripe we put it into a hash table so we
+ * can lock the stripe and merge more rbios into it.
+ */
+ struct list_head hash_list;
+
+ /* LRU list for the stripe cache */
+ struct list_head stripe_cache;
+
+ /* For scheduling work in the helper threads */
+ struct work_struct work;
+
+ /*
+ * bio_list and bio_list_lock are used to add more bios into the stripe
+ * in hopes of avoiding the full RMW
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /*
+ * Also protected by the bio_list_lock, the plug list is used by the
+ * plugging code to collect partial bios while plugged. The stripe
+ * locking code also uses it to hand off the stripe lock to the next
+ * pending IO.
+ */
+ struct list_head plug_list;
+
+ /* Flags that tell us if it is safe to merge with this bio. */
+ unsigned long flags;
+
+ /*
+ * Set if we're doing a parity rebuild for a read from higher up, which
+ * is handled differently from a parity rebuild as part of RMW.
+ */
+ enum btrfs_rbio_ops operation;
+
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
+
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Numer of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
+
+ /*
+ * Size of all the bios in the bio_list. This helps us decide if the
+ * rbio maps to a full stripe or not.
+ */
+ int bio_list_bytes;
+
+ refcount_t refs;
+
+ atomic_t stripes_pending;
+
+ atomic_t error;
+
+ struct work_struct end_io_work;
+
+ /* Bitmap to record which horizontal stripe has data */
+ unsigned long dbitmap;
+
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
+ unsigned long finish_pbitmap;
+
+ /*
+ * These are two arrays of pointers. We allocate the rbio big enough
+ * to hold them both and setup their locations when the rbio is
+ * allocated.
+ */
+
+ /*
+ * Pointers to pages that we allocated for reading/writing stripes
+ * directly from the disk (including P/Q).
+ */
+ struct page **stripe_pages;
+
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
+
+ /*
+ * For subpage support, we need to map each sector to above
+ * stripe_pages.
+ */
+ struct sector_ptr *stripe_sectors;
+
+ /* Allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * No matter signed or not, (-1) is always the one indicating we can not grab
+ * the proper stripe number.
+ */
+struct raid56_bio_trace_info {
+ u64 devid;
+
+ /* The offset inside the stripe. (<= STRIPE_LEN) */
+ u32 offset;
+
+ /*
+ * Stripe number.
+ * 0 is the first data stripe, and nr_data for P stripe,
+ * nr_data + 1 for Q stripe.
+ * >= real_stripes for
+ */
+ u8 stripe_nr;
+};
static inline int nr_data_stripes(const struct map_lookup *map)
{
- return map->num_stripes - nr_parity_stripes(map);
+ return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
+
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
((x) == RAID6_Q_STRIPE))
-struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len);
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num);
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical);
+ unsigned int pgoff, u64 logical);
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index a97dc74a4d3d..5c1a617eb25d 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -8,7 +8,7 @@
struct rcu_string {
struct rcu_head rcu;
- char str[0];
+ char str[];
};
static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
deleted file mode 100644
index 243a2e44526e..000000000000
--- a/fs/btrfs/reada.c
+++ /dev/null
@@ -1,1014 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2011 STRATO. All rights reserved.
- */
-
-#include <linux/sched.h>
-#include <linux/pagemap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include "ctree.h"
-#include "volumes.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "dev-replace.h"
-#include "block-group.h"
-
-#undef DEBUG
-
-/*
- * This is the implementation for the generic read ahead framework.
- *
- * To trigger a readahead, btrfs_reada_add must be called. It will start
- * a read ahead for the given range [start, end) on tree root. The returned
- * handle can either be used to wait on the readahead to finish
- * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
- *
- * The read ahead works as follows:
- * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
- * reada_start_machine will then search for extents to prefetch and trigger
- * some reads. When a read finishes for a node, all contained node/leaf
- * pointers that lie in the given range will also be enqueued. The reads will
- * be triggered in sequential order, thus giving a big win over a naive
- * enumeration. It will also make use of multi-device layouts. Each disk
- * will have its on read pointer and all disks will by utilized in parallel.
- * Also will no two disks read both sides of a mirror simultaneously, as this
- * would waste seeking capacity. Instead both disks will read different parts
- * of the filesystem.
- * Any number of readaheads can be started in parallel. The read order will be
- * determined globally, i.e. 2 parallel readaheads will normally finish faster
- * than the 2 started one after another.
- */
-
-#define MAX_IN_FLIGHT 6
-
-struct reada_extctl {
- struct list_head list;
- struct reada_control *rc;
- u64 generation;
-};
-
-struct reada_extent {
- u64 logical;
- struct btrfs_key top;
- struct list_head extctl;
- int refcnt;
- spinlock_t lock;
- struct reada_zone *zones[BTRFS_MAX_MIRRORS];
- int nzones;
- int scheduled;
-};
-
-struct reada_zone {
- u64 start;
- u64 end;
- u64 elems;
- struct list_head list;
- spinlock_t lock;
- int locked;
- struct btrfs_device *device;
- struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
- * self */
- int ndevs;
- struct kref refcnt;
-};
-
-struct reada_machine_work {
- struct btrfs_work work;
- struct btrfs_fs_info *fs_info;
-};
-
-static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
-static void reada_control_release(struct kref *kref);
-static void reada_zone_release(struct kref *kref);
-static void reada_start_machine(struct btrfs_fs_info *fs_info);
-static void __reada_start_machine(struct btrfs_fs_info *fs_info);
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 generation);
-
-/* recurses */
-/* in case of err, eb might be NULL */
-static void __readahead_hook(struct btrfs_fs_info *fs_info,
- struct reada_extent *re, struct extent_buffer *eb,
- int err)
-{
- int nritems;
- int i;
- u64 bytenr;
- u64 generation;
- struct list_head list;
-
- spin_lock(&re->lock);
- /*
- * just take the full list from the extent. afterwards we
- * don't need the lock anymore
- */
- list_replace_init(&re->extctl, &list);
- re->scheduled = 0;
- spin_unlock(&re->lock);
-
- /*
- * this is the error case, the extent buffer has not been
- * read correctly. We won't access anything from it and
- * just cleanup our data structures. Effectively this will
- * cut the branch below this node from read ahead.
- */
- if (err)
- goto cleanup;
-
- /*
- * FIXME: currently we just set nritems to 0 if this is a leaf,
- * effectively ignoring the content. In a next step we could
- * trigger more readahead depending from the content, e.g.
- * fetch the checksums for the extents in the leaf.
- */
- if (!btrfs_header_level(eb))
- goto cleanup;
-
- nritems = btrfs_header_nritems(eb);
- generation = btrfs_header_generation(eb);
- for (i = 0; i < nritems; i++) {
- struct reada_extctl *rec;
- u64 n_gen;
- struct btrfs_key key;
- struct btrfs_key next_key;
-
- btrfs_node_key_to_cpu(eb, &key, i);
- if (i + 1 < nritems)
- btrfs_node_key_to_cpu(eb, &next_key, i + 1);
- else
- next_key = re->top;
- bytenr = btrfs_node_blockptr(eb, i);
- n_gen = btrfs_node_ptr_generation(eb, i);
-
- list_for_each_entry(rec, &list, list) {
- struct reada_control *rc = rec->rc;
-
- /*
- * if the generation doesn't match, just ignore this
- * extctl. This will probably cut off a branch from
- * prefetch. Alternatively one could start a new (sub-)
- * prefetch for this branch, starting again from root.
- * FIXME: move the generation check out of this loop
- */
-#ifdef DEBUG
- if (rec->generation != generation) {
- btrfs_debug(fs_info,
- "generation mismatch for (%llu,%d,%llu) %llu != %llu",
- key.objectid, key.type, key.offset,
- rec->generation, generation);
- }
-#endif
- if (rec->generation == generation &&
- btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
- btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
- reada_add_block(rc, bytenr, &next_key, n_gen);
- }
- }
-
-cleanup:
- /*
- * free extctl records
- */
- while (!list_empty(&list)) {
- struct reada_control *rc;
- struct reada_extctl *rec;
-
- rec = list_first_entry(&list, struct reada_extctl, list);
- list_del(&rec->list);
- rc = rec->rc;
- kfree(rec);
-
- kref_get(&rc->refcnt);
- if (atomic_dec_and_test(&rc->elems)) {
- kref_put(&rc->refcnt, reada_control_release);
- wake_up(&rc->wait);
- }
- kref_put(&rc->refcnt, reada_control_release);
-
- reada_extent_put(fs_info, re); /* one ref for each entry */
- }
-
- return;
-}
-
-int btree_readahead_hook(struct extent_buffer *eb, int err)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int ret = 0;
- struct reada_extent *re;
-
- /* find extent */
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree,
- eb->start >> PAGE_SHIFT);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
- if (!re) {
- ret = -1;
- goto start_machine;
- }
-
- __readahead_hook(fs_info, re, eb, err);
- reada_extent_put(fs_info, re); /* our ref */
-
-start_machine:
- reada_start_machine(fs_info);
- return ret;
-}
-
-static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_bio *bbio)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
- int ret;
- struct reada_zone *zone;
- struct btrfs_block_group *cache = NULL;
- u64 start;
- u64 end;
- int i;
-
- zone = NULL;
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_SHIFT, 1);
- if (ret == 1 && logical >= zone->start && logical <= zone->end) {
- kref_get(&zone->refcnt);
- spin_unlock(&fs_info->reada_lock);
- return zone;
- }
-
- spin_unlock(&fs_info->reada_lock);
-
- cache = btrfs_lookup_block_group(fs_info, logical);
- if (!cache)
- return NULL;
-
- start = cache->start;
- end = start + cache->length - 1;
- btrfs_put_block_group(cache);
-
- zone = kzalloc(sizeof(*zone), GFP_KERNEL);
- if (!zone)
- return NULL;
-
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret) {
- kfree(zone);
- return NULL;
- }
-
- zone->start = start;
- zone->end = end;
- INIT_LIST_HEAD(&zone->list);
- spin_lock_init(&zone->lock);
- zone->locked = 0;
- kref_init(&zone->refcnt);
- zone->elems = 0;
- zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bbio->num_stripes; ++i) {
- /* bounds have already been checked */
- zone->devs[i] = bbio->stripes[i].dev;
- }
- zone->ndevs = bbio->num_stripes;
-
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)(zone->end >> PAGE_SHIFT),
- zone);
-
- if (ret == -EEXIST) {
- kfree(zone);
- ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_SHIFT, 1);
- if (ret == 1 && logical >= zone->start && logical <= zone->end)
- kref_get(&zone->refcnt);
- else
- zone = NULL;
- }
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
-
- return zone;
-}
-
-static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
- u64 logical,
- struct btrfs_key *top)
-{
- int ret;
- struct reada_extent *re = NULL;
- struct reada_extent *re_exist = NULL;
- struct btrfs_bio *bbio = NULL;
- struct btrfs_device *dev;
- struct btrfs_device *prev_dev;
- u64 length;
- int real_stripes;
- int nzones = 0;
- unsigned long index = logical >> PAGE_SHIFT;
- int dev_replace_is_ongoing;
- int have_zone = 0;
-
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree, index);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
-
- if (re)
- return re;
-
- re = kzalloc(sizeof(*re), GFP_KERNEL);
- if (!re)
- return NULL;
-
- re->logical = logical;
- re->top = *top;
- INIT_LIST_HEAD(&re->extctl);
- spin_lock_init(&re->lock);
- re->refcnt = 1;
-
- /*
- * map block
- */
- length = fs_info->nodesize;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0);
- if (ret || !bbio || length < fs_info->nodesize)
- goto error;
-
- if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
- btrfs_err(fs_info,
- "readahead: more than %d copies not supported",
- BTRFS_MAX_MIRRORS);
- goto error;
- }
-
- real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
- for (nzones = 0; nzones < real_stripes; ++nzones) {
- struct reada_zone *zone;
-
- dev = bbio->stripes[nzones].dev;
-
- /* cannot read ahead on missing device. */
- if (!dev->bdev)
- continue;
-
- zone = reada_find_zone(dev, logical, bbio);
- if (!zone)
- continue;
-
- re->zones[re->nzones++] = zone;
- spin_lock(&zone->lock);
- if (!zone->elems)
- kref_get(&zone->refcnt);
- ++zone->elems;
- spin_unlock(&zone->lock);
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
- if (re->nzones == 0) {
- /* not a single zone found, error and out */
- goto error;
- }
-
- /* Insert extent in reada tree + all per-device trees, all or nothing */
- down_read(&fs_info->dev_replace.rwsem);
- ret = radix_tree_preload(GFP_KERNEL);
- if (ret) {
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
-
- spin_lock(&fs_info->reada_lock);
- ret = radix_tree_insert(&fs_info->reada_tree, index, re);
- if (ret == -EEXIST) {
- re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
- re_exist->refcnt++;
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- if (ret) {
- spin_unlock(&fs_info->reada_lock);
- radix_tree_preload_end();
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- radix_tree_preload_end();
- prev_dev = NULL;
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
- &fs_info->dev_replace);
- for (nzones = 0; nzones < re->nzones; ++nzones) {
- dev = re->zones[nzones]->device;
-
- if (dev == prev_dev) {
- /*
- * in case of DUP, just add the first zone. As both
- * are on the same device, there's nothing to gain
- * from adding both.
- * Also, it wouldn't work, as the tree is per device
- * and adding would fail with EEXIST
- */
- continue;
- }
- if (!dev->bdev)
- continue;
-
- if (dev_replace_is_ongoing &&
- dev == fs_info->dev_replace.tgtdev) {
- /*
- * as this device is selected for reading only as
- * a last resort, skip it for read ahead.
- */
- continue;
- }
- prev_dev = dev;
- ret = radix_tree_insert(&dev->reada_extents, index, re);
- if (ret) {
- while (--nzones >= 0) {
- dev = re->zones[nzones]->device;
- BUG_ON(dev == NULL);
- /* ignore whether the entry was inserted */
- radix_tree_delete(&dev->reada_extents, index);
- }
- radix_tree_delete(&fs_info->reada_tree, index);
- spin_unlock(&fs_info->reada_lock);
- up_read(&fs_info->dev_replace.rwsem);
- goto error;
- }
- have_zone = 1;
- }
- spin_unlock(&fs_info->reada_lock);
- up_read(&fs_info->dev_replace.rwsem);
-
- if (!have_zone)
- goto error;
-
- btrfs_put_bbio(bbio);
- return re;
-
-error:
- for (nzones = 0; nzones < re->nzones; ++nzones) {
- struct reada_zone *zone;
-
- zone = re->zones[nzones];
- kref_get(&zone->refcnt);
- spin_lock(&zone->lock);
- --zone->elems;
- if (zone->elems == 0) {
- /*
- * no fs_info->reada_lock needed, as this can't be
- * the last ref
- */
- kref_put(&zone->refcnt, reada_zone_release);
- }
- spin_unlock(&zone->lock);
-
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
- btrfs_put_bbio(bbio);
- kfree(re);
- return re_exist;
-}
-
-static void reada_extent_put(struct btrfs_fs_info *fs_info,
- struct reada_extent *re)
-{
- int i;
- unsigned long index = re->logical >> PAGE_SHIFT;
-
- spin_lock(&fs_info->reada_lock);
- if (--re->refcnt) {
- spin_unlock(&fs_info->reada_lock);
- return;
- }
-
- radix_tree_delete(&fs_info->reada_tree, index);
- for (i = 0; i < re->nzones; ++i) {
- struct reada_zone *zone = re->zones[i];
-
- radix_tree_delete(&zone->device->reada_extents, index);
- }
-
- spin_unlock(&fs_info->reada_lock);
-
- for (i = 0; i < re->nzones; ++i) {
- struct reada_zone *zone = re->zones[i];
-
- kref_get(&zone->refcnt);
- spin_lock(&zone->lock);
- --zone->elems;
- if (zone->elems == 0) {
- /* no fs_info->reada_lock needed, as this can't be
- * the last ref */
- kref_put(&zone->refcnt, reada_zone_release);
- }
- spin_unlock(&zone->lock);
-
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
- spin_unlock(&fs_info->reada_lock);
- }
-
- kfree(re);
-}
-
-static void reada_zone_release(struct kref *kref)
-{
- struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
-
- radix_tree_delete(&zone->device->reada_zones,
- zone->end >> PAGE_SHIFT);
-
- kfree(zone);
-}
-
-static void reada_control_release(struct kref *kref)
-{
- struct reada_control *rc = container_of(kref, struct reada_control,
- refcnt);
-
- kfree(rc);
-}
-
-static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 generation)
-{
- struct btrfs_fs_info *fs_info = rc->fs_info;
- struct reada_extent *re;
- struct reada_extctl *rec;
-
- /* takes one ref */
- re = reada_find_extent(fs_info, logical, top);
- if (!re)
- return -1;
-
- rec = kzalloc(sizeof(*rec), GFP_KERNEL);
- if (!rec) {
- reada_extent_put(fs_info, re);
- return -ENOMEM;
- }
-
- rec->rc = rc;
- rec->generation = generation;
- atomic_inc(&rc->elems);
-
- spin_lock(&re->lock);
- list_add_tail(&rec->list, &re->extctl);
- spin_unlock(&re->lock);
-
- /* leave the ref on the extent */
-
- return 0;
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
-{
- int i;
- unsigned long index = zone->end >> PAGE_SHIFT;
-
- for (i = 0; i < zone->ndevs; ++i) {
- struct reada_zone *peer;
- peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
- if (peer && peer->device != zone->device)
- peer->locked = lock;
- }
-}
-
-/*
- * called with fs_info->reada_lock held
- */
-static int reada_pick_zone(struct btrfs_device *dev)
-{
- struct reada_zone *top_zone = NULL;
- struct reada_zone *top_locked_zone = NULL;
- u64 top_elems = 0;
- u64 top_locked_elems = 0;
- unsigned long index = 0;
- int ret;
-
- if (dev->reada_curr_zone) {
- reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
- kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
- dev->reada_curr_zone = NULL;
- }
- /* pick the zone with the most elements */
- while (1) {
- struct reada_zone *zone;
-
- ret = radix_tree_gang_lookup(&dev->reada_zones,
- (void **)&zone, index, 1);
- if (ret == 0)
- break;
- index = (zone->end >> PAGE_SHIFT) + 1;
- if (zone->locked) {
- if (zone->elems > top_locked_elems) {
- top_locked_elems = zone->elems;
- top_locked_zone = zone;
- }
- } else {
- if (zone->elems > top_elems) {
- top_elems = zone->elems;
- top_zone = zone;
- }
- }
- }
- if (top_zone)
- dev->reada_curr_zone = top_zone;
- else if (top_locked_zone)
- dev->reada_curr_zone = top_locked_zone;
- else
- return 0;
-
- dev->reada_next = dev->reada_curr_zone->start;
- kref_get(&dev->reada_curr_zone->refcnt);
- reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
-
- return 1;
-}
-
-static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- int mirror_num, struct extent_buffer **eb)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(buf))
- return 0;
-
- set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
- ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
- if (ret) {
- free_extent_buffer_stale(buf);
- return ret;
- }
-
- if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
- free_extent_buffer_stale(buf);
- return -EIO;
- } else if (extent_buffer_uptodate(buf)) {
- *eb = buf;
- } else {
- free_extent_buffer(buf);
- }
- return 0;
-}
-
-static int reada_start_machine_dev(struct btrfs_device *dev)
-{
- struct btrfs_fs_info *fs_info = dev->fs_info;
- struct reada_extent *re = NULL;
- int mirror_num = 0;
- struct extent_buffer *eb = NULL;
- u64 logical;
- int ret;
- int i;
-
- spin_lock(&fs_info->reada_lock);
- if (dev->reada_curr_zone == NULL) {
- ret = reada_pick_zone(dev);
- if (!ret) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- }
- /*
- * FIXME currently we issue the reads one extent at a time. If we have
- * a contiguous block of extents, we could also coagulate them or use
- * plugging to speed things up
- */
- ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_SHIFT, 1);
- if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
- ret = reada_pick_zone(dev);
- if (!ret) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- re = NULL;
- ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_SHIFT, 1);
- }
- if (ret == 0) {
- spin_unlock(&fs_info->reada_lock);
- return 0;
- }
- dev->reada_next = re->logical + fs_info->nodesize;
- re->refcnt++;
-
- spin_unlock(&fs_info->reada_lock);
-
- spin_lock(&re->lock);
- if (re->scheduled || list_empty(&re->extctl)) {
- spin_unlock(&re->lock);
- reada_extent_put(fs_info, re);
- return 0;
- }
- re->scheduled = 1;
- spin_unlock(&re->lock);
-
- /*
- * find mirror num
- */
- for (i = 0; i < re->nzones; ++i) {
- if (re->zones[i]->device == dev) {
- mirror_num = i + 1;
- break;
- }
- }
- logical = re->logical;
-
- atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
- if (ret)
- __readahead_hook(fs_info, re, NULL, ret);
- else if (eb)
- __readahead_hook(fs_info, re, eb, ret);
-
- if (eb)
- free_extent_buffer(eb);
-
- atomic_dec(&dev->reada_in_flight);
- reada_extent_put(fs_info, re);
-
- return 1;
-
-}
-
-static void reada_start_machine_worker(struct btrfs_work *work)
-{
- struct reada_machine_work *rmw;
- int old_ioprio;
-
- rmw = container_of(work, struct reada_machine_work, work);
-
- old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
- task_nice_ioprio(current));
- set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(rmw->fs_info);
- set_task_ioprio(current, old_ioprio);
-
- atomic_dec(&rmw->fs_info->reada_works_cnt);
-
- kfree(rmw);
-}
-
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- u64 enqueued;
- u64 total = 0;
- int i;
-
-again:
- do {
- enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (atomic_read(&device->reada_in_flight) <
- MAX_IN_FLIGHT)
- enqueued += reada_start_machine_dev(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
- total += enqueued;
- } while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
-
- if (enqueued == 0)
- return;
-
- /*
- * If everything is already in the cache, this is effectively single
- * threaded. To a) not hold the caller for too long and b) to utilize
- * more cores, we broke the loop above after 10000 iterations and now
- * enqueue to workers to finish it. This will distribute the load to
- * the cores.
- */
- for (i = 0; i < 2; ++i) {
- reada_start_machine(fs_info);
- if (atomic_read(&fs_info->reada_works_cnt) >
- BTRFS_MAX_MIRRORS * 2)
- break;
- }
-}
-
-static void reada_start_machine(struct btrfs_fs_info *fs_info)
-{
- struct reada_machine_work *rmw;
-
- rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
- if (!rmw) {
- /* FIXME we cannot handle this properly right now */
- BUG();
- }
- btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
- rmw->fs_info = fs_info;
-
- btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
- atomic_inc(&fs_info->reada_works_cnt);
-}
-
-#ifdef DEBUG
-static void dump_devs(struct btrfs_fs_info *fs_info, int all)
-{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- unsigned long index;
- int ret;
- int i;
- int j;
- int cnt;
-
- spin_lock(&fs_info->reada_lock);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
- atomic_read(&device->reada_in_flight));
- index = 0;
- while (1) {
- struct reada_zone *zone;
- ret = radix_tree_gang_lookup(&device->reada_zones,
- (void **)&zone, index, 1);
- if (ret == 0)
- break;
- pr_debug(" zone %llu-%llu elems %llu locked %d devs",
- zone->start, zone->end, zone->elems,
- zone->locked);
- for (j = 0; j < zone->ndevs; ++j) {
- pr_cont(" %lld",
- zone->devs[j]->devid);
- }
- if (device->reada_curr_zone == zone)
- pr_cont(" curr off %llu",
- device->reada_next - zone->start);
- pr_cont("\n");
- index = (zone->end >> PAGE_SHIFT) + 1;
- }
- cnt = 0;
- index = 0;
- while (all) {
- struct reada_extent *re = NULL;
-
- ret = radix_tree_gang_lookup(&device->reada_extents,
- (void **)&re, index, 1);
- if (ret == 0)
- break;
- pr_debug(" re: logical %llu size %u empty %d scheduled %d",
- re->logical, fs_info->nodesize,
- list_empty(&re->extctl), re->scheduled);
-
- for (i = 0; i < re->nzones; ++i) {
- pr_cont(" zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- pr_cont(" %lld",
- re->zones[i]->devs[j]->devid);
- }
- }
- pr_cont("\n");
- index = (re->logical >> PAGE_SHIFT) + 1;
- if (++cnt > 15)
- break;
- }
- }
-
- index = 0;
- cnt = 0;
- while (all) {
- struct reada_extent *re = NULL;
-
- ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
- index, 1);
- if (ret == 0)
- break;
- if (!re->scheduled) {
- index = (re->logical >> PAGE_SHIFT) + 1;
- continue;
- }
- pr_debug("re: logical %llu size %u list empty %d scheduled %d",
- re->logical, fs_info->nodesize,
- list_empty(&re->extctl), re->scheduled);
- for (i = 0; i < re->nzones; ++i) {
- pr_cont(" zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- pr_cont(" %lld",
- re->zones[i]->devs[j]->devid);
- }
- }
- pr_cont("\n");
- index = (re->logical >> PAGE_SHIFT) + 1;
- }
- spin_unlock(&fs_info->reada_lock);
-}
-#endif
-
-/*
- * interface
- */
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
- struct btrfs_key *key_start, struct btrfs_key *key_end)
-{
- struct reada_control *rc;
- u64 start;
- u64 generation;
- int ret;
- struct extent_buffer *node;
- static struct btrfs_key max_key = {
- .objectid = (u64)-1,
- .type = (u8)-1,
- .offset = (u64)-1
- };
-
- rc = kzalloc(sizeof(*rc), GFP_KERNEL);
- if (!rc)
- return ERR_PTR(-ENOMEM);
-
- rc->fs_info = root->fs_info;
- rc->key_start = *key_start;
- rc->key_end = *key_end;
- atomic_set(&rc->elems, 0);
- init_waitqueue_head(&rc->wait);
- kref_init(&rc->refcnt);
- kref_get(&rc->refcnt); /* one ref for having elements */
-
- node = btrfs_root_node(root);
- start = node->start;
- generation = btrfs_header_generation(node);
- free_extent_buffer(node);
-
- ret = reada_add_block(rc, start, &max_key, generation);
- if (ret) {
- kfree(rc);
- return ERR_PTR(ret);
- }
-
- reada_start_machine(root->fs_info);
-
- return rc;
-}
-
-#ifdef DEBUG
-int btrfs_reada_wait(void *handle)
-{
- struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->fs_info;
-
- while (atomic_read(&rc->elems)) {
- if (!atomic_read(&fs_info->reada_works_cnt))
- reada_start_machine(fs_info);
- wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
- 5 * HZ);
- dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
- }
-
- dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
-
- kref_put(&rc->refcnt, reada_control_release);
-
- return 0;
-}
-#else
-int btrfs_reada_wait(void *handle)
-{
- struct reada_control *rc = handle;
- struct btrfs_fs_info *fs_info = rc->fs_info;
-
- while (atomic_read(&rc->elems)) {
- if (!atomic_read(&fs_info->reada_works_cnt))
- reada_start_machine(fs_info);
- wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
- (HZ + 9) / 10);
- }
-
- kref_put(&rc->refcnt, reada_control_release);
-
- return 0;
-}
-#endif
-
-void btrfs_reada_detach(void *handle)
-{
- struct reada_control *rc = handle;
-
- kref_put(&rc->refcnt, reada_control_release);
-}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 454a1015d026..a248f46cfe72 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -218,11 +218,11 @@ static void __print_stack_trace(struct btrfs_fs_info *fs_info,
stack_trace_print(ra->trace, ra->trace_len, 2);
}
#else
-static void inline __save_stack_trace(struct ref_action *ra)
+static inline void __save_stack_trace(struct ref_action *ra)
{
}
-static void inline __print_stack_trace(struct btrfs_fs_info *fs_info,
+static inline void __print_stack_trace(struct btrfs_fs_info *fs_info,
struct ref_action *ra)
{
btrfs_err(fs_info, " ref-verify: no stacktrace support");
@@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
struct block_entry *be = NULL, *exist;
struct root_entry *re = NULL;
- re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
- be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+ re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+ be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
if (!be || !re) {
kfree(re);
kfree(be);
@@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
exist_re = insert_root_entry(&exist->roots, re);
if (exist_re)
kfree(re);
+ } else {
+ kfree(re);
}
kfree(be);
return exist;
@@ -311,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
struct root_entry *re;
struct ref_entry *ref = NULL, *exist;
- ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -356,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
struct block_entry *be;
struct ref_entry *ref;
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, 0);
@@ -391,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
@@ -433,7 +435,7 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
struct extent_buffer *leaf = path->nodes[0];
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
int type, ret;
@@ -493,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
}
static int process_leaf(struct btrfs_root *root,
- struct btrfs_path *path, u64 *bytenr, u64 *num_bytes)
+ struct btrfs_path *path, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret = 0;
+ int i = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
@@ -509,19 +512,19 @@ static int process_leaf(struct btrfs_root *root,
switch (key.type) {
case BTRFS_EXTENT_ITEM_KEY:
*num_bytes = key.offset;
- /* fall through */
+ fallthrough;
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
- &tree_block_level);
+ tree_block_level);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
ret = add_tree_block(fs_info, key.offset, 0,
- key.objectid, tree_block_level);
+ key.objectid, *tree_block_level);
break;
case BTRFS_SHARED_BLOCK_REF_KEY:
ret = add_tree_block(fs_info, 0, key.offset,
- key.objectid, tree_block_level);
+ key.objectid, *tree_block_level);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = btrfs_item_ptr(leaf, i,
@@ -547,38 +550,25 @@ static int process_leaf(struct btrfs_root *root,
/* Walk down to the leaf from the given level */
static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
- int level, u64 *bytenr, u64 *num_bytes)
+ int level, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb;
- u64 block_bytenr, gen;
int ret = 0;
while (level >= 0) {
if (level) {
- struct btrfs_key first_key;
-
- block_bytenr = btrfs_node_blockptr(path->nodes[level],
- path->slots[level]);
- gen = btrfs_node_ptr_generation(path->nodes[level],
- path->slots[level]);
- btrfs_node_key_to_cpu(path->nodes[level], &first_key,
- path->slots[level]);
- eb = read_tree_block(fs_info, block_bytenr, gen,
- level - 1, &first_key);
+ eb = btrfs_read_node_slot(path->nodes[level],
+ path->slots[level]);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- return -EIO;
- }
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
- path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level-1] = BTRFS_READ_LOCK;
} else {
- ret = process_leaf(root, path, bytenr, num_bytes);
+ ret = process_leaf(root, path, bytenr, num_bytes,
+ tree_block_level);
if (ret)
break;
}
@@ -679,19 +669,19 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root;
- u64 owner;
- u64 offset;
+ u64 ref_root = 0;
+ u64 owner = 0;
+ u64 offset = 0;
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
if (generic_ref->type == BTRFS_REF_METADATA) {
- ref_root = generic_ref->tree_ref.root;
+ if (!parent)
+ ref_root = generic_ref->tree_ref.owning_root;
owner = generic_ref->tree_ref.level;
- offset = 0;
- } else {
- ref_root = generic_ref->data_ref.ref_root;
+ } else if (!parent) {
+ ref_root = generic_ref->data_ref.owning_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
@@ -706,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
goto out;
}
- if (parent) {
- ref->parent = parent;
- } else {
- ref->root_objectid = ref_root;
- ref->owner = owner;
- ref->offset = offset;
- }
+ ref->parent = parent;
+ ref->owner = owner;
+ ref->root_objectid = ref_root;
+ ref->offset = offset;
ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;
memcpy(&ra->ref, ref, sizeof(struct ref_entry));
@@ -797,8 +784,16 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (!be) {
btrfs_err(fs_info,
"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
- action, (unsigned long long)bytenr,
- (unsigned long long)num_bytes);
+ action, bytenr, num_bytes);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
+ } else if (be->num_refs == 0) {
+ btrfs_err(fs_info,
+ "trying to do action %d for a bytenr that has 0 total references",
+ action);
+ dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
@@ -849,6 +844,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
@@ -976,8 +972,10 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
/* Walk down all roots and build the ref tree, meant to be called at mount */
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *eb;
+ int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
int ret, level;
@@ -988,12 +986,12 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_read(eb);
+ extent_root = btrfs_extent_root(fs_info, 0);
+ eb = btrfs_read_lock_root_node(extent_root);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_READ_LOCK;
while (1) {
/*
@@ -1002,8 +1000,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
* would have had to added a ref key item which may appear on a
* different leaf from the original extent item.
*/
- ret = walk_down_tree(fs_info->extent_root, path, level,
- &bytenr, &num_bytes);
+ ret = walk_down_tree(extent_root, path, level,
+ &bytenr, &num_bytes, &tree_block_level);
if (ret)
break;
ret = walk_up_tree(path, &level);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
new file mode 100644
index 000000000000..f50586ff85c8
--- /dev/null
+++ b/fs/btrfs/reflink.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/iversion.h>
+#include "compression.h"
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "disk-io.h"
+#include "reflink.h"
+#include "transaction.h"
+#include "subpage.h"
+
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen,
+ int no_time_update)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ if (!no_time_update) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size) {
+ i_size_write(inode, endoff);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ }
+
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+static int copy_inline_to_page(struct btrfs_inode *inode,
+ const u64 file_offset,
+ char *inline_data,
+ const u64 size,
+ const u64 datal,
+ const u8 comp_type)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 block_size = fs_info->sectorsize;
+ const u64 range_end = file_offset + block_size - 1;
+ const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+ char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+ struct extent_changeset *data_reserved = NULL;
+ struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ int ret;
+
+ ASSERT(IS_ALIGNED(file_offset, block_size));
+
+ /*
+ * We have flushed and locked the ranges of the source and destination
+ * inodes, we also have locked the inodes, so we are safe to do a
+ * reservation here. Also we must not do the reservation while holding
+ * a transaction open, otherwise we would deadlock.
+ */
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
+ if (ret)
+ goto out;
+
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * After dirtying the page our caller will need to start a transaction,
+ * and if we are low on metadata free space, that can cause flushing of
+ * delalloc for all inodes in order to get metadata space released.
+ * However we are holding the range locked for the whole duration of
+ * the clone/dedupe operation, so we may deadlock if that happens and no
+ * other task releases enough space. So mark this inode as not being
+ * possible to flush to avoid such deadlock. We will clear that flag
+ * when we finish cloning all extents, since a transaction is started
+ * after finding each extent to clone.
+ */
+ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
+
+ if (comp_type == BTRFS_COMPRESS_NONE) {
+ memcpy_to_page(page, offset_in_page(file_offset), data_start,
+ datal);
+ } else {
+ ret = btrfs_decompress(comp_type, data_start, page,
+ offset_in_page(file_offset),
+ inline_size, datal);
+ if (ret)
+ goto out_unlock;
+ flush_dcache_page(page);
+ }
+
+ /*
+ * If our inline data is smaller then the block/page size, then the
+ * remaining of the block/page is equivalent to zeroes. We had something
+ * like the following done:
+ *
+ * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+ * $ sync # (or fsync)
+ * $ xfs_io -c "falloc 0 4K" file
+ * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+ *
+ * So what's in the range [500, 4095] corresponds to zeroes.
+ */
+ if (datal < block_size)
+ memzero_page(page, datal, block_size - datal);
+
+ btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
+ btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+out_unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
+out:
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+/*
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
+ */
+static int clone_copy_inline_extent(struct inode *dst,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 size,
+ const u8 comp_type,
+ char *inline_data,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ fs_info->sectorsize);
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0) {
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
+ }
+
+ key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ /*
+ * There's an implicit hole at file offset 0, copy the
+ * inline extent's data to the page.
+ */
+ ASSERT(key.offset > 0);
+ goto copy_to_page;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent replace it with the source inline
+ * extent, otherwise copy the source inline extent data into
+ * the respective page at the destination inode.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ goto copy_to_page;
+ }
+
+copy_inline_extent:
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * At the destination offset 0 we have either a hole, a regular
+ * extent or an inline extent larger then the one we want to
+ * clone. Deal with all these cases by copying the inline extent
+ * data into the respective page at the destination inode.
+ */
+ goto copy_to_page;
+ }
+
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
+ btrfs_release_path(path);
+ /*
+ * If we end up here it means were copy the inline extent into a leaf
+ * of the destination inode. We know we will drop or adjust at most one
+ * extent item in the destination root.
+ *
+ * 1 unit - adjusting old extent (we may have to split it)
+ * 1 unit - add new extent
+ * 1 unit - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ drop_args.path = path;
+ drop_args.start = drop_start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
+ if (ret)
+ goto out;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ goto out;
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(BTRFS_I(dst));
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+out:
+ if (!ret && !trans) {
+ /*
+ * No transaction here means we copied the inline extent into a
+ * page of the destination inode.
+ *
+ * 1 unit to update inode item
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+ if (ret && trans) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+ if (!ret)
+ *trans_out = trans;
+
+ return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff, int no_time_update)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_path *path = NULL;
+ struct extent_buffer *leaf;
+ struct btrfs_trans_handle *trans;
+ char *buf = NULL;
+ struct btrfs_key key;
+ u32 nritems;
+ int slot;
+ int ret;
+ const u64 len = olen_aligned;
+ u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
+
+ ret = -ENOMEM;
+ buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!buf)
+ return ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ kvfree(buf);
+ return ret;
+ }
+
+ path->reada = READA_FORWARD;
+ /* Clone data */
+ key.objectid = btrfs_ino(BTRFS_I(src));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = off;
+
+ while (1) {
+ struct btrfs_file_extent_item *extent;
+ u64 extent_gen;
+ int type;
+ u32 size;
+ struct btrfs_key new_key;
+ u64 disko = 0, diskl = 0;
+ u64 datao = 0, datal = 0;
+ u8 comp;
+ u64 drop_start;
+
+ /* Note the key will change type as we walk through the tree */
+ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ break;
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ }
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.type > BTRFS_EXTENT_DATA_KEY ||
+ key.objectid != btrfs_ino(BTRFS_I(src)))
+ break;
+
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
+ extent_gen = btrfs_file_extent_generation(leaf, extent);
+ comp = btrfs_file_extent_compression(leaf, extent);
+ type = btrfs_file_extent_type(leaf, extent);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+ diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+ datao = btrfs_file_extent_offset(leaf, extent);
+ datal = btrfs_file_extent_num_bytes(leaf, extent);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ /* Take upper bound, may be compressed */
+ datal = btrfs_file_extent_ram_bytes(leaf, extent);
+ }
+
+ /*
+ * The first search might have left us at an extent item that
+ * ends before our target range's start, can happen if we have
+ * holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
+ */
+ if (key.offset + datal <= prev_extent_end) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+
+ prev_extent_end = key.offset + datal;
+ size = btrfs_item_size(leaf, slot);
+ read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+
+ memcpy(&new_key, &key, sizeof(new_key));
+ new_key.objectid = btrfs_ino(BTRFS_I(inode));
+ if (off <= key.offset)
+ new_key.offset = key.offset + destoff - off;
+ else
+ new_key.offset = destoff;
+
+ /*
+ * Deal with a hole that doesn't have an extent item that
+ * represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning range or at
+ * the beginning (fully overlaps it or partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ struct btrfs_replace_extent_info clone_info;
+
+ /*
+ * a | --- range to clone ---| b
+ * | ------------- extent ------------- |
+ */
+
+ /* Subtract range b */
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ /* Subtract range a */
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ clone_info.disk_offset = disko;
+ clone_info.disk_len = diskl;
+ clone_info.data_offset = datao;
+ clone_info.data_len = datal;
+ clone_info.file_offset = new_key.offset;
+ clone_info.extent_buf = buf;
+ clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ drop_start, new_key.offset + datal - 1,
+ &clone_info, &trans);
+ if (ret)
+ goto out;
+ } else {
+ ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
+ /*
+ * Inline extents always have to start at file offset 0
+ * and can never be bigger then the sector size. We can
+ * never clone only parts of an inline extent, since all
+ * reflink operations must start at a sector size aligned
+ * offset, and the length must be aligned too or end at
+ * the i_size (which implies the whole inlined data).
+ */
+ ASSERT(key.offset == 0);
+ ASSERT(datal <= fs_info->sectorsize);
+ if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+ WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ ret = clone_copy_inline_extent(inode, path, &new_key,
+ drop_start, datal, size,
+ comp, buf, &trans);
+ if (ret)
+ goto out;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * Whenever we share an extent we update the last_reflink_trans
+ * of each inode to the current transaction. This is needed to
+ * make sure fsync does not log multiple checksum items with
+ * overlapping ranges (because some extent items might refer
+ * only to sections of the original extent). For the destination
+ * inode we do this regardless of the generation of the extents
+ * or even if they are inline extents or explicit holes, to make
+ * sure a full fsync does not skip them. For the source inode,
+ * we only need to update last_reflink_trans in case it's a new
+ * extent that is not a hole or an inline extent, to deal with
+ * the checksums problem on fsync.
+ */
+ if (extent_gen == trans->transid && disko > 0)
+ BTRFS_I(src)->last_reflink_trans = trans->transid;
+
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
+
+ last_dest_end = ALIGN(new_key.offset + datal,
+ fs_info->sectorsize);
+ ret = clone_finish_inode_update(trans, inode, last_dest_end,
+ destoff, olen, no_time_update);
+ if (ret)
+ goto out;
+ if (new_key.offset + datal >= destoff + len)
+ break;
+
+ btrfs_release_path(path);
+ key.offset = prev_extent_end;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
+ }
+ ret = 0;
+
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole that fully or partially overlaps our
+ * cloning range at its end. This means that we either have the
+ * NO_HOLES feature enabled or the implicit hole happened due to
+ * mixing buffered and direct IO writes against this file.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * When using NO_HOLES and we are cloning a range that covers
+ * only a hole (no extents) into a range beyond the current
+ * i_size, punching a hole in the target range will not create
+ * an extent map defining a hole, because the range starts at or
+ * beyond current i_size. If the file previously had an i_size
+ * greater than the new i_size set by this clone operation, we
+ * need to make sure the next fsync is a full fsync, so that it
+ * detects and logs a hole covering a range from the current
+ * i_size to the new i_size. If the clone range covers extents,
+ * besides a hole, then we know the full sync flag was already
+ * set by previous calls to btrfs_replace_file_extents() that
+ * replaced file extent items.
+ */
+ if (last_dest_end >= i_size_read(inode))
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
+
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ last_dest_end, destoff + len - 1, NULL, &trans);
+ if (ret)
+ goto out;
+
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen, no_time_update);
+ }
+
+out:
+ btrfs_free_path(path);
+ kvfree(buf);
+ clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
+ return ret;
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ u64 range1_end = loff1 + len - 1;
+ u64 range2_end = loff2 + len - 1;
+
+ if (inode1 < inode2) {
+ swap(inode1, inode2);
+ swap(loff1, loff2);
+ swap(range1_end, range2_end);
+ } else if (inode1 == inode2 && loff2 < loff1) {
+ swap(loff1, loff2);
+ swap(range1_end, range2_end);
+ }
+
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
+ btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
+}
+
+static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+ down_write(&BTRFS_I(inode1)->i_mmap_lock);
+ down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+}
+
+static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+{
+ up_write(&BTRFS_I(inode1)->i_mmap_lock);
+ up_write(&BTRFS_I(inode2)->i_mmap_lock);
+}
+
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ const u64 bs = fs_info->sb->s_blocksize;
+ int ret;
+
+ /*
+ * Lock destination range to serialize with concurrent readahead() and
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+ btrfs_btree_balance_dirty(fs_info);
+
+ return ret;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+ struct inode *dst, u64 dst_loff)
+{
+ int ret = 0;
+ u64 i, tail_len, chunk_count;
+ struct btrfs_root *root_dst = BTRFS_I(dst)->root;
+
+ spin_lock(&root_dst->root_item_lock);
+ if (root_dst->send_in_progress) {
+ btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+ root_dst->root_key.objectid,
+ root_dst->send_in_progress);
+ spin_unlock(&root_dst->root_item_lock);
+ return -EAGAIN;
+ }
+ root_dst->dedupe_in_progress++;
+ spin_unlock(&root_dst->root_item_lock);
+
+ tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+ chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+
+ for (i = 0; i < chunk_count; i++) {
+ ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+ dst, dst_loff);
+ if (ret)
+ goto out;
+
+ loff += BTRFS_MAX_DEDUPE_LEN;
+ dst_loff += BTRFS_MAX_DEDUPE_LEN;
+ }
+
+ if (tail_len > 0)
+ ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+out:
+ spin_lock(&root_dst->root_item_lock);
+ root_dst->dedupe_in_progress--;
+ spin_unlock(&root_dst->root_item_lock);
+
+ return ret;
+}
+
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ int ret;
+ int wb_ret;
+ u64 len = olen;
+ u64 bs = fs_info->sb->s_blocksize;
+
+ /*
+ * VFS's generic_remap_file_range_prep() protects us from cloning the
+ * eof block into the middle of a file, which would result in corruption
+ * if the file size is not blocksize aligned. So we don't need to check
+ * for that case here.
+ */
+ if (off + len == src->i_size)
+ len = ALIGN(src->i_size, bs) - off;
+
+ if (destoff > inode->i_size) {
+ const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
+
+ ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
+ if (ret)
+ return ret;
+ /*
+ * We may have truncated the last block if the inode's size is
+ * not sector size aligned, so we need to wait for writeback to
+ * complete before proceeding further, otherwise we can race
+ * with cloning and attempt to increment a reference to an
+ * extent that no longer exists (writeback completed right after
+ * we found the previous extent covering eof and before we
+ * attempted to increment its reference count).
+ */
+ ret = btrfs_wait_ordered_range(inode, wb_start,
+ destoff - wb_start);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Lock destination range to serialize with concurrent readahead() and
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+ /*
+ * We may have copied an inline extent into a page of the destination
+ * range, so wait for writeback to complete before truncating pages
+ * from the page cache. This is a rare case.
+ */
+ wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ ret = ret ? ret : wb_ret;
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data,
+ round_down(destoff, PAGE_SIZE),
+ round_up(destoff + len, PAGE_SIZE) - 1);
+
+ btrfs_btree_balance_dirty(fs_info);
+
+ return ret;
+}
+
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+ u64 wb_len;
+ int ret;
+
+ if (!(remap_flags & REMAP_FILE_DEDUP)) {
+ struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+ if (btrfs_root_readonly(root_out))
+ return -EROFS;
+
+ ASSERT(inode_in->i_sb == inode_out->i_sb);
+ }
+
+ /* Don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ return -EINVAL;
+ }
+
+ /*
+ * Now that the inodes are locked, we need to start writeback ourselves
+ * and can not rely on the writeback from the VFS's generic helper
+ * generic_remap_file_range_prep() because:
+ *
+ * 1) For compression we must call filemap_fdatawrite_range() range
+ * twice (btrfs_fdatawrite_range() does it for us), and the generic
+ * helper only calls it once;
+ *
+ * 2) filemap_fdatawrite_range(), called by the generic helper only
+ * waits for the writeback to complete, i.e. for IO to be done, and
+ * not for the ordered extents to complete. We need to wait for them
+ * to complete so that new file extent items are in the fs tree.
+ */
+ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
+ wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ else
+ wb_len = ALIGN(*len, bs);
+
+ /*
+ * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
+ *
+ * Btrfs' back references do not have a block level granularity, they
+ * work at the whole extent level.
+ * NOCOW buffered write without data space reserved may not be able
+ * to fall back to CoW due to lack of data space, thus could cause
+ * data loss.
+ *
+ * Here we take a shortcut by flushing the whole inode, so that all
+ * nocow write should reach disk as nocow before we increase the
+ * reference of the extent. We could do better by only flushing NOCOW
+ * data, but that needs extra accounting.
+ *
+ * Also we don't need to check ASYNC_EXTENT, as async extent will be
+ * CoWed anyway, not affecting nocow part.
+ */
+ ret = filemap_flush(inode_in->i_mapping);
+ if (ret < 0)
+ return ret;
+
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ wb_len);
+ if (ret < 0)
+ return ret;
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ wb_len);
+ if (ret < 0)
+ return ret;
+
+ return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+}
+
+static bool file_sync_write(const struct file *file)
+{
+ if (file->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(file)))
+ return true;
+
+ return false;
+}
+
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, loff_t len,
+ unsigned int remap_flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ bool same_inode = dst_inode == src_inode;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (same_inode) {
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ lock_two_nondirectories(src_inode, dst_inode);
+ btrfs_double_mmap_lock(src_inode, dst_inode);
+ }
+
+ ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ else
+ ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+
+out_unlock:
+ if (same_inode) {
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ btrfs_double_mmap_unlock(src_inode, dst_inode);
+ unlock_two_nondirectories(src_inode, dst_inode);
+ }
+
+ /*
+ * If either the source or the destination file was opened with O_SYNC,
+ * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
+ * source files/ranges, so that after a successful return (0) followed
+ * by a power failure results in the reflinked data to be readable from
+ * both files/ranges.
+ */
+ if (ret == 0 && len > 0 &&
+ (file_sync_write(src_file) || file_sync_write(dst_file))) {
+ ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
+ if (ret == 0)
+ ret = btrfs_sync_file(dst_file, destoff,
+ destoff + len - 1, 0);
+ }
+
+ return ret < 0 ? ret : len;
+}
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
new file mode 100644
index 000000000000..ecb309b4dad0
--- /dev/null
+++ b/fs/btrfs/reflink.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_REFLINK_H
+#define BTRFS_REFLINK_H
+
+#include <linux/fs.h>
+
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+
+#endif /* BTRFS_REFLINK_H */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 995d4b8b1cfd..666a37a0ee89 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -17,106 +18,72 @@
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
-#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "backref.h"
+#include "misc.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "inode-item.h"
/*
- * backref_node, mapping_node and tree_block start with this
- */
-struct tree_entry {
- struct rb_node rb_node;
- u64 bytenr;
-};
-
-/*
- * present a tree block in the backref cache
- */
-struct backref_node {
- struct rb_node rb_node;
- u64 bytenr;
-
- u64 new_bytenr;
- /* objectid of tree block owner, can be not uptodate */
- u64 owner;
- /* link to pending, changed or detached list */
- struct list_head list;
- /* list of upper level blocks reference this block */
- struct list_head upper;
- /* list of child blocks in the cache */
- struct list_head lower;
- /* NULL if this node is not tree root */
- struct btrfs_root *root;
- /* extent buffer got by COW the block */
- struct extent_buffer *eb;
- /* level of tree block */
- unsigned int level:8;
- /* is the block in non-reference counted tree */
- unsigned int cowonly:1;
- /* 1 if no child node in the cache */
- unsigned int lowest:1;
- /* is the extent buffer locked */
- unsigned int locked:1;
- /* has the block been processed */
- unsigned int processed:1;
- /* have backrefs of this block been checked */
- unsigned int checked:1;
- /*
- * 1 if corresponding block has been cowed but some upper
- * level block pointers may not point to the new location
- */
- unsigned int pending:1;
- /*
- * 1 if the backref node isn't connected to any other
- * backref node.
- */
- unsigned int detached:1;
-};
-
-/*
- * present a block pointer in the backref cache
+ * Relocation overview
+ *
+ * [What does relocation do]
+ *
+ * The objective of relocation is to relocate all extents of the target block
+ * group to other block groups.
+ * This is utilized by resize (shrink only), profile converting, compacting
+ * space, or balance routine to spread chunks over devices.
+ *
+ * Before | After
+ * ------------------------------------------------------------------
+ * BG A: 10 data extents | BG A: deleted
+ * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
+ * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated)
+ *
+ * [How does relocation work]
+ *
+ * 1. Mark the target block group read-only
+ * New extents won't be allocated from the target block group.
+ *
+ * 2.1 Record each extent in the target block group
+ * To build a proper map of extents to be relocated.
+ *
+ * 2.2 Build data reloc tree and reloc trees
+ * Data reloc tree will contain an inode, recording all newly relocated
+ * data extents.
+ * There will be only one data reloc tree for one data block group.
+ *
+ * Reloc tree will be a special snapshot of its source tree, containing
+ * relocated tree blocks.
+ * Each tree referring to a tree block in target block group will get its
+ * reloc tree built.
+ *
+ * 2.3 Swap source tree with its corresponding reloc tree
+ * Each involved tree only refers to new extents after swap.
+ *
+ * 3. Cleanup reloc trees and data reloc tree.
+ * As old extents in the target block group are still referenced by reloc
+ * trees, we need to clean them up before really freeing the target block
+ * group.
+ *
+ * The main complexity is in steps 2.2 and 2.3.
+ *
+ * The entry point of relocation is relocate_block_group() function.
*/
-struct backref_edge {
- struct list_head list[2];
- struct backref_node *node[2];
-};
-#define LOWER 0
-#define UPPER 1
#define RELOCATION_RESERVED_NODES 256
-
-struct backref_cache {
- /* red black tree of all backref nodes in the cache */
- struct rb_root rb_root;
- /* for passing backref nodes to btrfs_reloc_cow_block */
- struct backref_node *path[BTRFS_MAX_LEVEL];
- /*
- * list of blocks that have been cowed but some block
- * pointers in upper level blocks may not reflect the
- * new location
- */
- struct list_head pending[BTRFS_MAX_LEVEL];
- /* list of backref nodes with no child node */
- struct list_head leaves;
- /* list of blocks that have been cowed in current transaction */
- struct list_head changed;
- /* list of detached backref node. */
- struct list_head detached;
-
- u64 last_trans;
-
- int nr_nodes;
- int nr_edges;
-};
-
/*
* map address of tree root to tree
*/
struct mapping_node {
- struct rb_node rb_node;
- u64 bytenr;
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simle_node for search/insert */
void *data;
};
@@ -129,8 +96,11 @@ struct mapping_tree {
* present a tree block to process
*/
struct tree_block {
- struct rb_node rb_node;
- u64 bytenr;
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ }; /* Use rb_simple_node for search/insert */
+ u64 owner;
struct btrfs_key key;
unsigned int level:8;
unsigned int key_ready:1;
@@ -155,7 +125,7 @@ struct reloc_control {
struct btrfs_block_rsv *block_rsv;
- struct backref_cache backref_cache;
+ struct btrfs_backref_cache backref_cache;
struct file_extent_cluster cluster;
/* tree blocks have been processed */
@@ -186,167 +156,41 @@ struct reloc_control {
#define MOVE_DATA_EXTENTS 0
#define UPDATE_DATA_PTRS 1
-static void remove_backref_node(struct backref_cache *cache,
- struct backref_node *node);
-static void __mark_block_processed(struct reloc_control *rc,
- struct backref_node *node);
-
-static void mapping_tree_init(struct mapping_tree *tree)
-{
- tree->rb_root = RB_ROOT;
- spin_lock_init(&tree->lock);
-}
-
-static void backref_cache_init(struct backref_cache *cache)
-{
- int i;
- cache->rb_root = RB_ROOT;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++)
- INIT_LIST_HEAD(&cache->pending[i]);
- INIT_LIST_HEAD(&cache->changed);
- INIT_LIST_HEAD(&cache->detached);
- INIT_LIST_HEAD(&cache->leaves);
-}
-
-static void backref_cache_cleanup(struct backref_cache *cache)
-{
- struct backref_node *node;
- int i;
-
- while (!list_empty(&cache->detached)) {
- node = list_entry(cache->detached.next,
- struct backref_node, list);
- remove_backref_node(cache, node);
- }
-
- while (!list_empty(&cache->leaves)) {
- node = list_entry(cache->leaves.next,
- struct backref_node, lower);
- remove_backref_node(cache, node);
- }
-
- cache->last_trans = 0;
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++)
- ASSERT(list_empty(&cache->pending[i]));
- ASSERT(list_empty(&cache->changed));
- ASSERT(list_empty(&cache->detached));
- ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
- ASSERT(!cache->nr_nodes);
- ASSERT(!cache->nr_edges);
-}
-
-static struct backref_node *alloc_backref_node(struct backref_cache *cache)
-{
- struct backref_node *node;
-
- node = kzalloc(sizeof(*node), GFP_NOFS);
- if (node) {
- INIT_LIST_HEAD(&node->list);
- INIT_LIST_HEAD(&node->upper);
- INIT_LIST_HEAD(&node->lower);
- RB_CLEAR_NODE(&node->rb_node);
- cache->nr_nodes++;
- }
- return node;
-}
-
-static void free_backref_node(struct backref_cache *cache,
- struct backref_node *node)
-{
- if (node) {
- cache->nr_nodes--;
- kfree(node);
- }
-}
-
-static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
-{
- struct backref_edge *edge;
-
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
- if (edge)
- cache->nr_edges++;
- return edge;
-}
-
-static void free_backref_edge(struct backref_cache *cache,
- struct backref_edge *edge)
+static void mark_block_processed(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
{
- if (edge) {
- cache->nr_edges--;
- kfree(edge);
- }
-}
+ u32 blocksize;
-static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
+ if (node->level == 0 ||
+ in_range(node->bytenr, rc->block_group->start,
+ rc->block_group->length)) {
+ blocksize = rc->extent_root->fs_info->nodesize;
+ set_extent_bits(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY);
}
-
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+ node->processed = 1;
}
-static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
-{
- struct rb_node *n = root->rb_node;
- struct tree_entry *entry;
-
- while (n) {
- entry = rb_entry(n, struct tree_entry, rb_node);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return n;
- }
- return NULL;
-}
-static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
+static void mapping_tree_init(struct mapping_tree *tree)
{
-
- struct btrfs_fs_info *fs_info = NULL;
- struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
- rb_node);
- if (bnode->root)
- fs_info = bnode->root->fs_info;
- btrfs_panic(fs_info, errno,
- "Inconsistency in backref cache found at offset %llu",
- bytenr);
+ tree->rb_root = RB_ROOT;
+ spin_lock_init(&tree->lock);
}
/*
* walk up backref nodes until reach node presents tree root
*/
-static struct backref_node *walk_up_backref(struct backref_node *node,
- struct backref_edge *edges[],
- int *index)
+static struct btrfs_backref_node *walk_up_backref(
+ struct btrfs_backref_node *node,
+ struct btrfs_backref_edge *edges[], int *index)
{
- struct backref_edge *edge;
+ struct btrfs_backref_edge *edge;
int idx = *index;
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[idx++] = edge;
node = edge->node[UPPER];
}
@@ -358,11 +202,11 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
/*
* walk down backref nodes to find start of next reference path
*/
-static struct backref_node *walk_down_backref(struct backref_edge *edges[],
- int *index)
+static struct btrfs_backref_node *walk_down_backref(
+ struct btrfs_backref_edge *edges[], int *index)
{
- struct backref_edge *edge;
- struct backref_node *lower;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *lower;
int idx = *index;
while (idx > 0) {
@@ -373,7 +217,7 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
continue;
}
edge = list_entry(edge->list[LOWER].next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[idx - 1] = edge;
*index = idx;
return edge->node[UPPER];
@@ -382,95 +226,24 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
return NULL;
}
-static void unlock_node_buffer(struct backref_node *node)
-{
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
-}
-
-static void drop_node_buffer(struct backref_node *node)
-{
- if (node->eb) {
- unlock_node_buffer(node);
- free_extent_buffer(node->eb);
- node->eb = NULL;
- }
-}
-
-static void drop_backref_node(struct backref_cache *tree,
- struct backref_node *node)
-{
- BUG_ON(!list_empty(&node->upper));
-
- drop_node_buffer(node);
- list_del(&node->list);
- list_del(&node->lower);
- if (!RB_EMPTY_NODE(&node->rb_node))
- rb_erase(&node->rb_node, &tree->rb_root);
- free_backref_node(tree, node);
-}
-
-/*
- * remove a backref node from the backref cache
- */
-static void remove_backref_node(struct backref_cache *cache,
- struct backref_node *node)
-{
- struct backref_node *upper;
- struct backref_edge *edge;
-
- if (!node)
- return;
-
- BUG_ON(!node->lowest && !node->detached);
- while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next, struct backref_edge,
- list[LOWER]);
- upper = edge->node[UPPER];
- list_del(&edge->list[LOWER]);
- list_del(&edge->list[UPPER]);
- free_backref_edge(cache, edge);
-
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- BUG_ON(!list_empty(&node->upper));
- drop_backref_node(cache, node);
- node = upper;
- node->lowest = 1;
- continue;
- }
- /*
- * add the node to leaf node list if no other
- * child block cached.
- */
- if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower, &cache->leaves);
- upper->lowest = 1;
- }
- }
-
- drop_backref_node(cache, node);
-}
-
-static void update_backref_node(struct backref_cache *cache,
- struct backref_node *node, u64 bytenr)
+static void update_backref_node(struct btrfs_backref_cache *cache,
+ struct btrfs_backref_node *node, u64 bytenr)
{
struct rb_node *rb_node;
rb_erase(&node->rb_node, &cache->rb_root);
node->bytenr = bytenr;
- rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, bytenr);
+ btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}
/*
* update backref cache after a transaction commit
*/
static int update_backref_cache(struct btrfs_trans_handle *trans,
- struct backref_cache *cache)
+ struct btrfs_backref_cache *cache)
{
- struct backref_node *node;
+ struct btrfs_backref_node *node;
int level = 0;
if (cache->last_trans == 0) {
@@ -488,13 +261,13 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&cache->detached)) {
node = list_entry(cache->detached.next,
- struct backref_node, list);
- remove_backref_node(cache, node);
+ struct btrfs_backref_node, list);
+ btrfs_backref_cleanup_node(cache, node);
}
while (!list_empty(&cache->changed)) {
node = list_entry(cache->changed.next,
- struct backref_node, list);
+ struct btrfs_backref_node, list);
list_del_init(&node->list);
BUG_ON(node->pending);
update_backref_node(cache, node, node->new_bytenr);
@@ -535,7 +308,8 @@ static bool reloc_root_is_dead(struct btrfs_root *root)
*
* Reloc tree after swap is considered dead, thus not considered as valid.
* This is enough for most callers, as they don't distinguish dead reloc root
- * from no reloc root. But should_ignore_root() below is a special case.
+ * from no reloc root. But btrfs_should_ignore_reloc_root() below is a
+ * special case.
*/
static bool have_reloc_root(struct btrfs_root *root)
{
@@ -546,11 +320,11 @@ static bool have_reloc_root(struct btrfs_root *root)
return true;
}
-static int should_ignore_root(struct btrfs_root *root)
+int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
/* This root has been merged with its reloc tree, we can ignore it */
@@ -561,8 +335,8 @@ static int should_ignore_root(struct btrfs_root *root)
if (!reloc_root)
return 0;
- if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
- root->fs_info->running_transaction->transid - 1)
+ if (btrfs_header_generation(reloc_root->commit_root) ==
+ root->fs_info->running_transaction->transid)
return 0;
/*
* if there is reloc tree and it was created in previous
@@ -572,624 +346,187 @@ static int should_ignore_root(struct btrfs_root *root)
*/
return 1;
}
+
/*
* find reloc tree by address of tree root
*/
-static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
- u64 bytenr)
+struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
+ struct reloc_control *rc = fs_info->reloc_ctl;
struct rb_node *rb_node;
struct mapping_node *node;
struct btrfs_root *root = NULL;
+ ASSERT(rc);
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
- root = (struct btrfs_root *)node->data;
+ root = node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
- return root;
+ return btrfs_grab_root(root);
}
-static int is_cowonly_root(u64 root_objectid)
+/*
+ * For useless nodes, do two major clean ups:
+ *
+ * - Cleanup the children edges and nodes
+ * If child node is also orphan (no parent) during cleanup, then the child
+ * node will also be cleaned up.
+ *
+ * - Freeing up leaves (level 0), keeps nodes detached
+ * For nodes, the node is still cached as "detached"
+ *
+ * Return false if @node is not in the @useless_nodes list.
+ * Return true if @node is in the @useless_nodes list.
+ */
+static bool handle_useless_nodes(struct reloc_control *rc,
+ struct btrfs_backref_node *node)
{
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
- root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
- root_objectid == BTRFS_DEV_TREE_OBJECTID ||
- root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root_objectid == BTRFS_UUID_TREE_OBJECTID ||
- root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
- root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return 1;
- return 0;
-}
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct list_head *useless_node = &cache->useless_node;
+ bool ret = false;
-static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid)
-{
- struct btrfs_key key;
+ while (!list_empty(useless_node)) {
+ struct btrfs_backref_node *cur;
- key.objectid = root_objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- if (is_cowonly_root(root_objectid))
- key.offset = 0;
- else
- key.offset = (u64)-1;
+ cur = list_first_entry(useless_node, struct btrfs_backref_node,
+ list);
+ list_del_init(&cur->list);
- return btrfs_get_fs_root(fs_info, &key, false);
-}
+ /* Only tree root nodes can be added to @useless_nodes */
+ ASSERT(list_empty(&cur->upper));
-static noinline_for_stack
-int find_inline_backref(struct extent_buffer *leaf, int slot,
- unsigned long *ptr, unsigned long *end)
-{
- struct btrfs_key key;
- struct btrfs_extent_item *ei;
- struct btrfs_tree_block_info *bi;
- u32 item_size;
+ if (cur == node)
+ ret = true;
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* The node is the lowest node */
+ if (cur->lowest) {
+ list_del_init(&cur->lower);
+ cur->lowest = 0;
+ }
- item_size = btrfs_item_size_nr(leaf, slot);
- if (item_size < sizeof(*ei)) {
- btrfs_print_v0_err(leaf->fs_info);
- btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
- return 1;
- }
- ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
- WARN_ON(!(btrfs_extent_flags(leaf, ei) &
- BTRFS_EXTENT_FLAG_TREE_BLOCK));
+ /* Cleanup the lower edges */
+ while (!list_empty(&cur->lower)) {
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_node *lower;
- if (key.type == BTRFS_EXTENT_ITEM_KEY &&
- item_size <= sizeof(*ei) + sizeof(*bi)) {
- WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
- return 1;
- }
- if (key.type == BTRFS_METADATA_ITEM_KEY &&
- item_size <= sizeof(*ei)) {
- WARN_ON(item_size < sizeof(*ei));
- return 1;
- }
+ edge = list_entry(cur->lower.next,
+ struct btrfs_backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ btrfs_backref_free_edge(cache, edge);
- if (key.type == BTRFS_EXTENT_ITEM_KEY) {
- bi = (struct btrfs_tree_block_info *)(ei + 1);
- *ptr = (unsigned long)(bi + 1);
- } else {
- *ptr = (unsigned long)(ei + 1);
+ /* Child node is also orphan, queue for cleanup */
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, useless_node);
+ }
+ /* Mark this block processed for relocation */
+ mark_block_processed(rc, cur);
+
+ /*
+ * Backref nodes for tree leaves are deleted from the cache.
+ * Backref nodes for upper level tree blocks are left in the
+ * cache to avoid unnecessary backref lookup.
+ */
+ if (cur->level > 0) {
+ list_add(&cur->list, &cache->detached);
+ cur->detached = 1;
+ } else {
+ rb_erase(&cur->rb_node, &cache->rb_root);
+ btrfs_backref_free_node(cache, cur);
+ }
}
- *end = (unsigned long)ei + item_size;
- return 0;
+ return ret;
}
/*
- * build backref tree for a given tree block. root of the backref tree
- * corresponds the tree block, leaves of the backref tree correspond
- * roots of b-trees that reference the tree block.
+ * Build backref tree for a given tree block. Root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond roots of
+ * b-trees that reference the tree block.
*
- * the basic idea of this function is check backrefs of a given block
- * to find upper level blocks that reference the block, and then check
- * backrefs of these upper level blocks recursively. the recursion stop
- * when tree root is reached or backrefs for the block is cached.
+ * The basic idea of this function is check backrefs of a given block to find
+ * upper level blocks that reference the block, and then check backrefs of
+ * these upper level blocks recursively. The recursion stops when tree root is
+ * reached or backrefs for the block is cached.
*
- * NOTE: if we find backrefs for a block are cached, we know backrefs
- * for all upper level blocks that directly/indirectly reference the
- * block are also cached.
+ * NOTE: if we find that backrefs for a block are cached, we know backrefs for
+ * all upper level blocks that directly/indirectly reference the block are also
+ * cached.
*/
-static noinline_for_stack
-struct backref_node *build_backref_tree(struct reloc_control *rc,
- struct btrfs_key *node_key,
- int level, u64 bytenr)
+static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
+ struct reloc_control *rc, struct btrfs_key *node_key,
+ int level, u64 bytenr)
{
- struct backref_cache *cache = &rc->backref_cache;
- struct btrfs_path *path1; /* For searching extent root */
- struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
- struct extent_buffer *eb;
- struct btrfs_root *root;
- struct backref_node *cur;
- struct backref_node *upper;
- struct backref_node *lower;
- struct backref_node *node = NULL;
- struct backref_node *exist = NULL;
- struct backref_edge *edge;
- struct rb_node *rb_node;
- struct btrfs_key key;
- unsigned long end;
- unsigned long ptr;
- LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
- LIST_HEAD(useless);
- int cowonly;
+ struct btrfs_backref_iter *iter;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ /* For searching parent of TREE_BLOCK_REF */
+ struct btrfs_path *path;
+ struct btrfs_backref_node *cur;
+ struct btrfs_backref_node *node = NULL;
+ struct btrfs_backref_edge *edge;
int ret;
int err = 0;
- bool need_check = true;
- path1 = btrfs_alloc_path();
- path2 = btrfs_alloc_path();
- if (!path1 || !path2) {
+ iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+ path = btrfs_alloc_path();
+ if (!path) {
err = -ENOMEM;
goto out;
}
- path1->reada = READA_FORWARD;
- path2->reada = READA_FORWARD;
- node = alloc_backref_node(cache);
+ node = btrfs_backref_alloc_node(cache, bytenr, level);
if (!node) {
err = -ENOMEM;
goto out;
}
- node->bytenr = bytenr;
- node->level = level;
node->lowest = 1;
cur = node;
-again:
- end = 0;
- ptr = 0;
- key.objectid = cur->bytenr;
- key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = (u64)-1;
-
- path1->search_commit_root = 1;
- path1->skip_locking = 1;
- ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
- 0, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- ASSERT(ret);
- ASSERT(path1->slots[0]);
-
- path1->slots[0]--;
-
- WARN_ON(cur->checked);
- if (!list_empty(&cur->upper)) {
- /*
- * the backref was added previously when processing
- * backref of type BTRFS_TREE_BLOCK_REF_KEY
- */
- ASSERT(list_is_singular(&cur->upper));
- edge = list_entry(cur->upper.next, struct backref_edge,
- list[LOWER]);
- ASSERT(list_empty(&edge->list[UPPER]));
- exist = edge->node[UPPER];
- /*
- * add the upper level block to pending list if we need
- * check its backrefs
- */
- if (!exist->checked)
- list_add_tail(&edge->list[UPPER], &list);
- } else {
- exist = NULL;
- }
-
- while (1) {
- cond_resched();
- eb = path1->nodes[0];
-
- if (ptr >= end) {
- if (path1->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(rc->extent_root, path1);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (ret > 0)
- break;
- eb = path1->nodes[0];
- }
-
- btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
- if (key.objectid != cur->bytenr) {
- WARN_ON(exist);
- break;
- }
-
- if (key.type == BTRFS_EXTENT_ITEM_KEY ||
- key.type == BTRFS_METADATA_ITEM_KEY) {
- ret = find_inline_backref(eb, path1->slots[0],
- &ptr, &end);
- if (ret)
- goto next;
- }
- }
-
- if (ptr < end) {
- /* update key for inline back ref */
- struct btrfs_extent_inline_ref *iref;
- int type;
- iref = (struct btrfs_extent_inline_ref *)ptr;
- type = btrfs_get_extent_inline_ref_type(eb, iref,
- BTRFS_REF_TYPE_BLOCK);
- if (type == BTRFS_REF_TYPE_INVALID) {
- err = -EUCLEAN;
- goto out;
- }
- key.type = type;
- key.offset = btrfs_extent_inline_ref_offset(eb, iref);
-
- WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
- key.type != BTRFS_SHARED_BLOCK_REF_KEY);
- }
-
- /*
- * Parent node found and matches current inline ref, no need to
- * rebuild this node for this inline ref.
- */
- if (exist &&
- ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
- exist->owner == key.offset) ||
- (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
- exist->bytenr == key.offset))) {
- exist = NULL;
- goto next;
- }
-
- /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
- if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
- if (key.objectid == key.offset) {
- /*
- * Only root blocks of reloc trees use backref
- * pointing to itself.
- */
- root = find_reloc_root(rc, cur->bytenr);
- ASSERT(root);
- cur->root = root;
- break;
- }
-
- edge = alloc_backref_edge(cache);
- if (!edge) {
- err = -ENOMEM;
- goto out;
- }
- rb_node = tree_search(&cache->rb_root, key.offset);
- if (!rb_node) {
- upper = alloc_backref_node(cache);
- if (!upper) {
- free_backref_edge(cache, edge);
- err = -ENOMEM;
- goto out;
- }
- upper->bytenr = key.offset;
- upper->level = cur->level + 1;
- /*
- * backrefs for the upper level block isn't
- * cached, add the block to pending list
- */
- list_add_tail(&edge->list[UPPER], &list);
- } else {
- upper = rb_entry(rb_node, struct backref_node,
- rb_node);
- ASSERT(upper->checked);
- INIT_LIST_HEAD(&edge->list[UPPER]);
- }
- list_add_tail(&edge->list[LOWER], &cur->upper);
- edge->node[LOWER] = cur;
- edge->node[UPPER] = upper;
-
- goto next;
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- err = -EINVAL;
- btrfs_print_v0_err(rc->extent_root->fs_info);
- btrfs_handle_fs_error(rc->extent_root->fs_info, err,
- NULL);
- goto out;
- } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
- goto next;
- }
-
- /*
- * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
- * means the root objectid. We need to search the tree to get
- * its parent bytenr.
- */
- root = read_fs_root(rc->extent_root->fs_info, key.offset);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
-
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- cur->cowonly = 1;
-
- if (btrfs_root_level(&root->root_item) == cur->level) {
- /* tree root */
- ASSERT(btrfs_root_bytenr(&root->root_item) ==
- cur->bytenr);
- if (should_ignore_root(root))
- list_add(&cur->list, &useless);
- else
- cur->root = root;
- break;
- }
-
- level = cur->level + 1;
- /* Search the tree to find parent blocks referring the block. */
- path2->search_commit_root = 1;
- path2->skip_locking = 1;
- path2->lowest_level = level;
- ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
- path2->lowest_level = 0;
+ /* Breadth-first search to build backref cache */
+ do {
+ ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
+ cur);
if (ret < 0) {
err = ret;
goto out;
}
- if (ret > 0 && path2->slots[level] > 0)
- path2->slots[level]--;
-
- eb = path2->nodes[level];
- if (btrfs_node_blockptr(eb, path2->slots[level]) !=
- cur->bytenr) {
- btrfs_err(root->fs_info,
- "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1,
- root->root_key.objectid,
- node_key->objectid, node_key->type,
- node_key->offset);
- err = -ENOENT;
- goto out;
- }
- lower = cur;
- need_check = true;
-
- /* Add all nodes and edges in the path */
- for (; level < BTRFS_MAX_LEVEL; level++) {
- if (!path2->nodes[level]) {
- ASSERT(btrfs_root_bytenr(&root->root_item) ==
- lower->bytenr);
- if (should_ignore_root(root))
- list_add(&lower->list, &useless);
- else
- lower->root = root;
- break;
- }
-
- edge = alloc_backref_edge(cache);
- if (!edge) {
- err = -ENOMEM;
- goto out;
- }
-
- eb = path2->nodes[level];
- rb_node = tree_search(&cache->rb_root, eb->start);
- if (!rb_node) {
- upper = alloc_backref_node(cache);
- if (!upper) {
- free_backref_edge(cache, edge);
- err = -ENOMEM;
- goto out;
- }
- upper->bytenr = eb->start;
- upper->owner = btrfs_header_owner(eb);
- upper->level = lower->level + 1;
- if (!test_bit(BTRFS_ROOT_REF_COWS,
- &root->state))
- upper->cowonly = 1;
-
- /*
- * if we know the block isn't shared
- * we can void checking its backrefs.
- */
- if (btrfs_block_can_be_shared(root, eb))
- upper->checked = 0;
- else
- upper->checked = 1;
-
- /*
- * add the block to pending list if we
- * need check its backrefs, we only do this once
- * while walking up a tree as we will catch
- * anything else later on.
- */
- if (!upper->checked && need_check) {
- need_check = false;
- list_add_tail(&edge->list[UPPER],
- &list);
- } else {
- if (upper->checked)
- need_check = true;
- INIT_LIST_HEAD(&edge->list[UPPER]);
- }
- } else {
- upper = rb_entry(rb_node, struct backref_node,
- rb_node);
- ASSERT(upper->checked);
- INIT_LIST_HEAD(&edge->list[UPPER]);
- if (!upper->owner)
- upper->owner = btrfs_header_owner(eb);
- }
- list_add_tail(&edge->list[LOWER], &lower->upper);
- edge->node[LOWER] = lower;
- edge->node[UPPER] = upper;
-
- if (rb_node)
- break;
- lower = upper;
- upper = NULL;
- }
- btrfs_release_path(path2);
-next:
- if (ptr < end) {
- ptr += btrfs_extent_inline_ref_size(key.type);
- if (ptr >= end) {
- WARN_ON(ptr > end);
- ptr = 0;
- end = 0;
- }
+ edge = list_first_entry_or_null(&cache->pending_edge,
+ struct btrfs_backref_edge, list[UPPER]);
+ /*
+ * The pending list isn't empty, take the first block to
+ * process
+ */
+ if (edge) {
+ list_del_init(&edge->list[UPPER]);
+ cur = edge->node[UPPER];
}
- if (ptr >= end)
- path1->slots[0]++;
- }
- btrfs_release_path(path1);
-
- cur->checked = 1;
- WARN_ON(exist);
-
- /* the pending list isn't empty, take the first block to process */
- if (!list_empty(&list)) {
- edge = list_entry(list.next, struct backref_edge, list[UPPER]);
- list_del_init(&edge->list[UPPER]);
- cur = edge->node[UPPER];
- goto again;
- }
+ } while (edge);
- /*
- * everything goes well, connect backref nodes and insert backref nodes
- * into the cache.
- */
- ASSERT(node->checked);
- cowonly = node->cowonly;
- if (!cowonly) {
- rb_node = tree_insert(&cache->rb_root, node->bytenr,
- &node->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, node->bytenr);
- list_add_tail(&node->lower, &cache->leaves);
+ /* Finish the upper linkage of newly added edges/nodes */
+ ret = btrfs_backref_finish_upper_links(cache, node);
+ if (ret < 0) {
+ err = ret;
+ goto out;
}
- list_for_each_entry(edge, &node->upper, list[LOWER])
- list_add_tail(&edge->list[UPPER], &list);
-
- while (!list_empty(&list)) {
- edge = list_entry(list.next, struct backref_edge, list[UPPER]);
- list_del_init(&edge->list[UPPER]);
- upper = edge->node[UPPER];
- if (upper->detached) {
- list_del(&edge->list[LOWER]);
- lower = edge->node[LOWER];
- free_backref_edge(cache, edge);
- if (list_empty(&lower->upper))
- list_add(&lower->list, &useless);
- continue;
- }
-
- if (!RB_EMPTY_NODE(&upper->rb_node)) {
- if (upper->lowest) {
- list_del_init(&upper->lower);
- upper->lowest = 0;
- }
-
- list_add_tail(&edge->list[UPPER], &upper->lower);
- continue;
- }
-
- if (!upper->checked) {
- /*
- * Still want to blow up for developers since this is a
- * logic bug.
- */
- ASSERT(0);
- err = -EINVAL;
- goto out;
- }
- if (cowonly != upper->cowonly) {
- ASSERT(0);
- err = -EINVAL;
- goto out;
- }
-
- if (!cowonly) {
- rb_node = tree_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST,
- upper->bytenr);
- }
-
- list_add_tail(&edge->list[UPPER], &upper->lower);
-
- list_for_each_entry(edge, &upper->upper, list[LOWER])
- list_add_tail(&edge->list[UPPER], &list);
- }
- /*
- * process useless backref nodes. backref nodes for tree leaves
- * are deleted from the cache. backref nodes for upper level
- * tree blocks are left in the cache to avoid unnecessary backref
- * lookup.
- */
- while (!list_empty(&useless)) {
- upper = list_entry(useless.next, struct backref_node, list);
- list_del_init(&upper->list);
- ASSERT(list_empty(&upper->upper));
- if (upper == node)
- node = NULL;
- if (upper->lowest) {
- list_del_init(&upper->lower);
- upper->lowest = 0;
- }
- while (!list_empty(&upper->lower)) {
- edge = list_entry(upper->lower.next,
- struct backref_edge, list[UPPER]);
- list_del(&edge->list[UPPER]);
- list_del(&edge->list[LOWER]);
- lower = edge->node[LOWER];
- free_backref_edge(cache, edge);
-
- if (list_empty(&lower->upper))
- list_add(&lower->list, &useless);
- }
- __mark_block_processed(rc, upper);
- if (upper->level > 0) {
- list_add(&upper->list, &cache->detached);
- upper->detached = 1;
- } else {
- rb_erase(&upper->rb_node, &cache->rb_root);
- free_backref_node(cache, upper);
- }
- }
+ if (handle_useless_nodes(rc, node))
+ node = NULL;
out:
- btrfs_free_path(path1);
- btrfs_free_path(path2);
+ btrfs_backref_iter_free(iter);
+ btrfs_free_path(path);
if (err) {
- while (!list_empty(&useless)) {
- lower = list_entry(useless.next,
- struct backref_node, list);
- list_del_init(&lower->list);
- }
- while (!list_empty(&list)) {
- edge = list_first_entry(&list, struct backref_edge,
- list[UPPER]);
- list_del(&edge->list[UPPER]);
- list_del(&edge->list[LOWER]);
- lower = edge->node[LOWER];
- upper = edge->node[UPPER];
- free_backref_edge(cache, edge);
-
- /*
- * Lower is no longer linked to any upper backref nodes
- * and isn't in the cache, we can free it ourselves.
- */
- if (list_empty(&lower->upper) &&
- RB_EMPTY_NODE(&lower->rb_node))
- list_add(&lower->list, &useless);
-
- if (!RB_EMPTY_NODE(&upper->rb_node))
- continue;
-
- /* Add this guy's upper edges to the list to process */
- list_for_each_entry(edge, &upper->upper, list[LOWER])
- list_add_tail(&edge->list[UPPER], &list);
- if (list_empty(&upper->upper))
- list_add(&upper->list, &useless);
- }
-
- while (!list_empty(&useless)) {
- lower = list_entry(useless.next,
- struct backref_node, list);
- list_del_init(&lower->list);
- if (lower == node)
- node = NULL;
- free_backref_node(cache, lower);
- }
-
- free_backref_node(cache, node);
+ btrfs_backref_error_cleanup(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
+ ASSERT(list_empty(&cache->useless_node) &&
+ list_empty(&cache->pending_edge));
return node;
}
@@ -1204,19 +541,19 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
- struct backref_cache *cache = &rc->backref_cache;
- struct backref_node *node = NULL;
- struct backref_node *new_node;
- struct backref_edge *edge;
- struct backref_edge *new_edge;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct btrfs_backref_node *node = NULL;
+ struct btrfs_backref_node *new_node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *new_edge;
struct rb_node *rb_node;
if (cache->last_trans > 0)
update_backref_cache(trans, cache);
- rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+ rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
if (rb_node) {
- node = rb_entry(rb_node, struct backref_node, rb_node);
+ node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
if (node->detached)
node = NULL;
else
@@ -1224,10 +561,10 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
}
if (!node) {
- rb_node = tree_search(&cache->rb_root,
- reloc_root->commit_root->start);
+ rb_node = rb_simple_search(&cache->rb_root,
+ reloc_root->commit_root->start);
if (rb_node) {
- node = rb_entry(rb_node, struct backref_node,
+ node = rb_entry(rb_node, struct btrfs_backref_node,
rb_node);
BUG_ON(node->detached);
}
@@ -1236,35 +573,33 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
if (!node)
return 0;
- new_node = alloc_backref_node(cache);
+ new_node = btrfs_backref_alloc_node(cache, dest->node->start,
+ node->level);
if (!new_node)
return -ENOMEM;
- new_node->bytenr = dest->node->start;
- new_node->level = node->level;
new_node->lowest = node->lowest;
new_node->checked = 1;
- new_node->root = dest;
+ new_node->root = btrfs_grab_root(dest);
+ ASSERT(new_node->root);
if (!node->lowest) {
list_for_each_entry(edge, &node->lower, list[UPPER]) {
- new_edge = alloc_backref_edge(cache);
+ new_edge = btrfs_backref_alloc_edge(cache);
if (!new_edge)
goto fail;
- new_edge->node[UPPER] = new_node;
- new_edge->node[LOWER] = edge->node[LOWER];
- list_add_tail(&new_edge->list[UPPER],
- &new_node->lower);
+ btrfs_backref_link_edge(new_edge, edge->node[LOWER],
+ new_node, LINK_UPPER);
}
} else {
list_add_tail(&new_node->lower, &cache->leaves);
}
- rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
- &new_node->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
+ btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
@@ -1276,11 +611,11 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
fail:
while (!list_empty(&new_node->lower)) {
new_edge = list_entry(new_node->lower.next,
- struct backref_edge, list[UPPER]);
+ struct btrfs_backref_edge, list[UPPER]);
list_del(&new_edge->list[UPPER]);
- free_backref_edge(cache, new_edge);
+ btrfs_backref_free_edge(cache, new_edge);
}
- free_backref_node(cache, new_node);
+ btrfs_backref_free_node(cache, new_node);
return -ENOMEM;
}
@@ -1298,17 +633,18 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (!node)
return -ENOMEM;
- node->bytenr = root->node->start;
+ node->bytenr = root->commit_root->start;
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
- btrfs_panic(fs_info, -EEXIST,
+ btrfs_err(fs_info,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
+ return -EEXIST;
}
list_add_tail(&root->root_list, &rc->reloc_roots);
@@ -1325,24 +661,37 @@ static void __del_reloc_root(struct btrfs_root *root)
struct rb_node *rb_node;
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
+ bool put_ref = false;
if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
- if (!node)
- return;
- BUG_ON((struct btrfs_root *)node->data != root);
+ ASSERT(!node || (struct btrfs_root *)node->data == root);
}
+ /*
+ * We only put the reloc root here if it's on the list. There's a lot
+ * of places where the pattern is to splice the rc->reloc_roots, process
+ * the reloc roots, and then add the reloc root back onto
+ * rc->reloc_roots. If we call __del_reloc_root while it's off of the
+ * list we don't want the reference being dropped, because the guy
+ * messing with the list is in charge of the reference.
+ */
spin_lock(&fs_info->trans_lock);
- list_del_init(&root->root_list);
+ if (!list_empty(&root->root_list)) {
+ put_ref = true;
+ list_del_init(&root->root_list);
+ }
spin_unlock(&fs_info->trans_lock);
+ if (put_ref)
+ btrfs_put_root(root);
kfree(node);
}
@@ -1350,7 +699,7 @@ static void __del_reloc_root(struct btrfs_root *root)
* helper to update the 'address of tree root -> reloc tree'
* mapping
*/
-static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+static int __update_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -1358,8 +707,8 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
struct reloc_control *rc = fs_info->reloc_ctl;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1371,12 +720,12 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
BUG_ON((struct btrfs_root *)node->data != root);
spin_lock(&rc->reloc_root_tree.lock);
- node->bytenr = new_bytenr;
- rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ node->bytenr = root->node->start;
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, node->bytenr);
+ btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
return 0;
}
@@ -1388,10 +737,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
- int ret;
+ int ret = 0;
+ bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
- BUG_ON(!root_item);
+ if (!root_item)
+ return ERR_PTR(-ENOMEM);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1403,7 +754,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
/*
* Set the last_snapshot field to the generation of the commit
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
@@ -1424,9 +777,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
*/
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
}
+ /*
+ * We have changed references at this point, we must abort the
+ * transaction if anything fails.
+ */
+ must_abort = true;
+
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
@@ -1436,7 +796,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
- root_item->drop_level = 0;
+ btrfs_set_root_drop_level(root_item, 0);
}
btrfs_tree_unlock(eb);
@@ -1444,18 +804,33 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
kfree(root_item);
- reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
- BUG_ON(IS_ERR(reloc_root));
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
+ if (IS_ERR(reloc_root)) {
+ ret = PTR_ERR(reloc_root);
+ goto abort;
+ }
+ set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
+fail:
+ kfree(root_item);
+abort:
+ if (must_abort)
+ btrfs_abort_transaction(trans, ret);
+ return ERR_PTR(ret);
}
/*
* create reloc tree for a given fs tree. reloc tree is just a
* snapshot of the fs tree with special root objectid.
+ *
+ * The reloc_root comes out of here with two references, one for
+ * root->reloc_root, and another for being on the rc->reloc_roots list.
*/
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -1467,6 +842,9 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
+ if (!rc)
+ return 0;
+
/*
* The subvolume has reloc tree but the swap is finished, no need to
* create/update the dead reloc tree
@@ -1474,13 +852,25 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
if (reloc_root_is_dead(root))
return 0;
+ /*
+ * This is subtle but important. We do not do
+ * record_root_in_transaction for reloc roots, instead we record their
+ * corresponding fs root, and then here we update the last trans for the
+ * reloc root. This means that we have to do this for the entire life
+ * of the reloc root, regardless of which stage of the relocation we are
+ * in.
+ */
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid;
return 0;
}
- if (!rc || !rc->create_reloc_tree ||
+ /*
+ * We are merging reloc roots, we do not need new reloc trees. Also
+ * reloc trees never need their own reloc tree.
+ */
+ if (!rc->create_reloc_tree ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
return 0;
@@ -1492,10 +882,17 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
trans->block_rsv = rsv;
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
- BUG_ON(ret < 0);
- root->reloc_root = reloc_root;
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
+ root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
@@ -1511,11 +908,18 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int ret;
if (!have_reloc_root(root))
- goto out;
+ return 0;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /*
+ * We are probably ok here, but __del_reloc_root() will drop its ref of
+ * the root. We have the ref for root->reloc_root, but just in case
+ * hold it while we update the reloc root.
+ */
+ btrfs_grab_root(reloc_root);
+
/* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
@@ -1529,6 +933,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
if (reloc_root->commit_root != reloc_root->node) {
+ __update_reloc_root(reloc_root);
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
reloc_root->commit_root = btrfs_root_node(reloc_root);
@@ -1536,10 +941,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
- BUG_ON(ret);
-
-out:
- return 0;
+ btrfs_put_root(reloc_root);
+ return ret;
}
/*
@@ -1596,14 +999,6 @@ again:
return NULL;
}
-static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group)
-{
- if (bytenr >= block_group->start &&
- bytenr < block_group->start + block_group->length)
- return 1;
- return 0;
-}
-
/*
* get new location of data
*/
@@ -1701,11 +1096,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
if (bytenr == 0)
continue;
- if (!in_block_group(bytenr, rc->block_group))
+ if (!in_range(bytenr, rc->block_group->start,
+ rc->block_group->length))
continue;
/*
- * if we are modifying block in fs tree, wait for readpage
+ * if we are modifying block in fs tree, wait for read_folio
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
@@ -1728,10 +1124,10 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (!ret)
continue;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- key.offset, end, 1);
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ key.offset, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end);
+ key.offset, end, NULL);
}
}
@@ -1751,9 +1147,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1762,9 +1158,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1820,8 +1216,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1829,7 +1225,6 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1839,10 +1234,14 @@ again:
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
- BUG_ON(ret);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return ret;
+ }
}
- btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1852,12 +1251,10 @@ again:
parent = eb;
while (1) {
- struct btrfs_key first_key;
-
level = btrfs_header_level(parent);
- BUG_ON(level < lowest_level);
+ ASSERT(level >= lowest_level);
- ret = btrfs_bin_search(parent, &key, level, &slot);
+ ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
break;
if (ret && slot > 0)
@@ -1869,7 +1266,6 @@ again:
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
- btrfs_node_key_to_cpu(parent, &first_key, slot);
if (level <= max_level) {
eb = path->nodes[level];
@@ -1894,23 +1290,22 @@ again:
break;
}
- eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
- level - 1, &first_key);
+ eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
- } else if (!extent_buffer_uptodate(eb)) {
- ret = -EIO;
- free_extent_buffer(eb);
- break;
}
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
- BUG_ON(ret);
+ slot, &eb,
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ break;
+ }
}
- btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1931,9 +1326,15 @@ again:
btrfs_release_path(path);
path->lowest_level = level;
+ set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+ clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
path->lowest_level = 0;
- BUG_ON(ret);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ break;
+ }
/*
* Info qgroup to trace both subtrees.
@@ -1970,30 +1371,42 @@ again:
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_unlock_up_safe(path, 0);
@@ -2049,10 +1462,8 @@ static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
int *level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb = NULL;
int i;
- u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
u32 nritems;
@@ -2060,8 +1471,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = *level; i > 0; i--) {
- struct btrfs_key first_key;
-
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] < nritems) {
@@ -2081,16 +1490,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
return 0;
}
- bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
- eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
- &first_key);
- if (IS_ERR(eb)) {
+ eb = btrfs_read_node_slot(eb, path->slots[i]);
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- return -EIO;
- }
BUG_ON(btrfs_header_level(eb) != i - 1);
path->nodes[i - 1] = eb;
path->slots[i - 1] = 0;
@@ -2163,10 +1565,10 @@ static int invalidate_extent_cache(struct btrfs_root *root,
end = (u64)-1;
}
- /* the lock_extent waits for readpage to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ /* the lock_extent waits for read_folio to complete */
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
}
return 0;
}
@@ -2192,12 +1594,13 @@ static int find_next_key(struct btrfs_path *path, int level,
/*
* Insert current subvolume into reloc_control::dirty_subvol_roots
*/
-static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_root *root)
+static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
{
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
+ int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
@@ -2206,14 +1609,18 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
reloc_root_item = &reloc_root->root_item;
memset(&reloc_root_item->drop_progress, 0,
sizeof(reloc_root_item->drop_progress));
- reloc_root_item->drop_level = 0;
+ btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
- btrfs_update_reloc_root(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ if (ret)
+ return ret;
if (list_empty(&root->reloc_dirty_list)) {
- btrfs_grab_fs_root(root);
+ btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
+
+ return 0;
}
static int clean_dirty_subvols(struct reloc_control *rc)
@@ -2231,24 +1638,34 @@ static int clean_dirty_subvols(struct reloc_control *rc)
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
- if (reloc_root) {
-
- ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
- if (ret2 < 0 && !ret)
- ret = ret2;
- }
/*
* Need barrier to ensure clear_bit() only happens after
* root->reloc_root = NULL. Pairs with have_reloc_root.
*/
smp_wmb();
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
- btrfs_put_fs_root(root);
+ if (reloc_root) {
+ /*
+ * btrfs_drop_snapshot drops our ref we hold for
+ * ->reloc_root. If it fails however we must
+ * drop the ref ourselves.
+ */
+ ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(reloc_root);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+ btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
- if (ret2 < 0 && !ret)
- ret = ret2;
+ ret2 = btrfs_drop_snapshot(root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(root);
+ if (!ret)
+ ret = ret2;
+ }
}
}
return ret;
@@ -2269,11 +1686,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ int reserve_level;
int level;
int max_level;
int replaced = 0;
- int ret;
- int err = 0;
+ int ret = 0;
u32 min_reserved;
path = btrfs_alloc_path();
@@ -2292,7 +1709,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
} else {
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
@@ -2309,32 +1726,50 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ /*
+ * In merge_reloc_root(), we modify the upper level pointer to swap the
+ * tree blocks between reloc tree and subvolume tree. Thus for tree
+ * block COW, we COW at most from level 1 to root level for each tree.
+ *
+ * Thus the needed metadata size is at most root_level * nodesize,
+ * and * 2 since we have two trees to COW.
+ */
+ reserve_level = max_t(int, 1, btrfs_root_level(root_item));
+ min_reserved = fs_info->nodesize * reserve_level * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
- BTRFS_RESERVE_FLUSH_ALL);
- if (ret) {
- err = ret;
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ min_reserved,
+ BTRFS_RESERVE_FLUSH_LIMIT);
+ if (ret)
goto out;
- }
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
+
+ /*
+ * At this point we no longer have a reloc_control, so we can't
+ * depend on btrfs_init_reloc_root to update our last_trans.
+ *
+ * But that's ok, we started the trans handle on our
+ * corresponding fs_root, which means it's been added to the
+ * dirty list. At commit time we'll still call
+ * btrfs_update_reloc_root() and update our root item
+ * appropriately.
+ */
+ reloc_root->last_trans = trans->transid;
trans->block_rsv = rc->block_rsv;
replaced = 0;
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0)
break;
@@ -2345,11 +1780,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
-
if (ret > 0) {
level = ret;
btrfs_node_key_to_cpu(path->nodes[level], &key,
@@ -2368,7 +1800,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
*/
btrfs_node_key(path->nodes[level], &root_item->drop_progress,
path->slots[level]);
- root_item->drop_level = level;
+ btrfs_set_root_drop_level(root_item, level);
btrfs_end_transaction_throttle(trans);
trans = NULL;
@@ -2384,16 +1816,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
- if (ret < 0)
- err = ret;
out:
btrfs_free_path(path);
- if (err == 0)
- insert_dirty_subvol(trans, rc, root);
+ if (ret == 0) {
+ ret = insert_dirty_subvol(trans, rc, root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -2403,7 +1837,7 @@ out:
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
- return err;
+ return ret;
}
static noinline_for_stack
@@ -2425,7 +1859,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
@@ -2435,7 +1869,7 @@ again:
if (IS_ERR(trans)) {
if (!err)
btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes);
+ num_bytes, NULL);
return PTR_ERR(trans);
}
@@ -2443,7 +1877,7 @@ again:
if (num_bytes != rc->merging_rsv_size) {
btrfs_end_transaction(trans);
btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes);
+ num_bytes, NULL);
goto again;
}
}
@@ -2455,9 +1889,20 @@ again:
struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
- root = read_fs_root(fs_info, reloc_root->root_key.offset);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
+ if (IS_ERR(root)) {
+ /*
+ * Even if we have an error we need this reloc root
+ * back on our list so we can clean up properly.
+ */
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_abort_transaction(trans, (int)PTR_ERR(root));
+ if (!err)
+ err = PTR_ERR(root);
+ break;
+ }
+ ASSERT(root->reloc_root == reloc_root);
/*
* set reference count to 1, so btrfs_recover_relocation
@@ -2465,15 +1910,27 @@ again:
*/
if (!err)
btrfs_set_root_refs(&reloc_root->root_item, 1);
- btrfs_update_reloc_root(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ /*
+ * Even if we have an error we need this reloc root back on our
+ * list so we can clean up properly.
+ */
list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ if (!err)
+ err = ret;
+ break;
+ }
}
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
- btrfs_commit_transaction(trans);
+ err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@@ -2482,17 +1939,10 @@ again:
static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
- struct btrfs_root *reloc_root;
+ struct btrfs_root *reloc_root, *tmp;
- while (!list_empty(list)) {
- reloc_root = list_entry(list->next, struct btrfs_root,
- root_list);
+ list_for_each_entry_safe(reloc_root, tmp, list, root_list)
__del_reloc_root(reloc_root);
- free_extent_buffer(reloc_root->node);
- free_extent_buffer(reloc_root->commit_root);
- reloc_root->node = NULL;
- reloc_root->commit_root = NULL;
- }
}
static noinline_for_stack
@@ -2522,13 +1972,34 @@ again:
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- root = read_fs_root(fs_info,
- reloc_root->root_key.offset);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
-
+ if (IS_ERR(root)) {
+ /*
+ * For recovery we read the fs roots on mount,
+ * and if we didn't find the root then we marked
+ * the reloc root as a garbage root. For normal
+ * relocation obviously the root should exist in
+ * memory. However there's no reason we can't
+ * handle the error properly here just in case.
+ */
+ ASSERT(0);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ if (root->reloc_root != reloc_root) {
+ /*
+ * This is actually impossible without something
+ * going really wrong (like weird race condition
+ * or cosmic rays).
+ */
+ ASSERT(0);
+ ret = -EINVAL;
+ goto out;
+ }
ret = merge_reloc_root(rc, root);
+ btrfs_put_root(root);
if (ret) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
@@ -2536,6 +2007,16 @@ again:
goto out;
}
} else {
+ if (!IS_ERR(root)) {
+ if (root->reloc_root == reloc_root) {
+ root->reloc_root = NULL;
+ btrfs_put_root(reloc_root);
+ }
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
+ &root->state);
+ btrfs_put_root(root);
+ }
+
list_del_init(&reloc_root->root_list);
/* Don't forget to queue this reloc root for cleanup */
list_add_tail(&reloc_root->reloc_dirty_list,
@@ -2550,18 +2031,30 @@ again:
out:
if (ret) {
btrfs_handle_fs_error(fs_info, ret, NULL);
- if (!list_empty(&reloc_roots))
- free_reloc_roots(&reloc_roots);
+ free_reloc_roots(&reloc_roots);
/* new reloc root may be added */
mutex_lock(&fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
mutex_unlock(&fs_info->reloc_mutex);
- if (!list_empty(&reloc_roots))
- free_reloc_roots(&reloc_roots);
+ free_reloc_roots(&reloc_roots);
}
- BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ /*
+ * We used to have
+ *
+ * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ *
+ * here, but it's wrong. If we fail to start the transaction in
+ * prepare_to_merge() we will have only 0 ref reloc roots, none of which
+ * have actually been removed from the reloc_root_tree rb tree. This is
+ * fine because we're bailing here, and we hold a reference on the root
+ * for the list that holds it, so these roots will be cleaned up when we
+ * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
+ * will be cleaned up on unmount.
+ *
+ * The remaining nodes will be cleaned up by free_reloc_control.
+ */
}
static void free_block_list(struct rb_root *blocks)
@@ -2580,51 +2073,126 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = reloc_root->fs_info;
struct btrfs_root *root;
+ int ret;
if (reloc_root->last_trans == trans->transid)
return 0;
- root = read_fs_root(fs_info, reloc_root->root_key.offset);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+ root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
- return btrfs_record_root_in_trans(trans, root);
+ /*
+ * This should succeed, since we can't have a reloc root without having
+ * already looked up the actual root and created the reloc root for this
+ * root.
+ *
+ * However if there's some sort of corruption where we have a ref to a
+ * reloc root without a corresponding root this could return ENOENT.
+ */
+ if (IS_ERR(root)) {
+ ASSERT(0);
+ return PTR_ERR(root);
+ }
+ if (root->reloc_root != reloc_root) {
+ ASSERT(0);
+ btrfs_err(fs_info,
+ "root %llu has two reloc roots associated with it",
+ reloc_root->root_key.offset);
+ btrfs_put_root(root);
+ return -EUCLEAN;
+ }
+ ret = btrfs_record_root_in_trans(trans, root);
+ btrfs_put_root(root);
+
+ return ret;
}
static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
- struct backref_edge *edges[])
+ struct btrfs_backref_node *node,
+ struct btrfs_backref_edge *edges[])
{
- struct backref_node *next;
+ struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
+ int ret;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- BUG_ON(!root);
- BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
+
+ /*
+ * If there is no root, then our references for this block are
+ * incomplete, as we should be able to walk all the way up to a
+ * block that is owned by a root.
+ *
+ * This path is only for SHAREABLE roots, so if we come upon a
+ * non-SHAREABLE root then we have backrefs that resolve
+ * improperly.
+ *
+ * Both of these cases indicate file system corruption, or a bug
+ * in the backref walking code.
+ */
+ if (!root) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu has multiple refs with one ending in a non-shareable root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- record_reloc_root_in_trans(trans, root);
+ ret = record_reloc_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
break;
}
- btrfs_record_root_in_trans(trans, root);
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
root = root->reloc_root;
+ /*
+ * We could have raced with another thread which failed, so
+ * root->reloc_root may not be set, return ENOENT in this case.
+ */
+ if (!root)
+ return ERR_PTR(-ENOENT);
+
if (next->new_bytenr != root->node->start) {
- BUG_ON(next->new_bytenr);
- BUG_ON(!list_empty(&next->list));
+ /*
+ * We just created the reloc root, so we shouldn't have
+ * ->new_bytenr set and this shouldn't be in the changed
+ * list. If it is then we have multiple roots pointing
+ * at the same bytenr which indicates corruption, or
+ * we've made a mistake in the backref walking code.
+ */
+ ASSERT(next->new_bytenr == 0);
+ ASSERT(list_empty(&next->list));
+ if (next->new_bytenr || !list_empty(&next->list)) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+
next->new_bytenr = root->node->start;
- next->root = root;
+ btrfs_put_root(next->root);
+ next->root = btrfs_grab_root(root);
+ ASSERT(next->root);
list_add_tail(&next->list,
&rc->backref_cache.changed);
- __mark_block_processed(rc, next);
+ mark_block_processed(rc, next);
break;
}
@@ -2634,8 +2202,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!next || next->level <= node->level)
break;
}
- if (!root)
- return NULL;
+ if (!root) {
+ /*
+ * This can happen if there's fs corruption or if there's a bug
+ * in the backref lookup code.
+ */
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
@@ -2649,18 +2223,21 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
}
/*
- * select a tree root for relocation. return NULL if the block
- * is reference counted. we should use do_relocation() in this
- * case. return a tree root pointer if the block isn't reference
- * counted. return -ENOENT if the block is root of reloc tree.
+ * Select a tree root for relocation.
+ *
+ * Return NULL if the block is not shareable. We should use do_relocation() in
+ * this case.
+ *
+ * Return a tree root pointer if the block is shareable.
+ * Return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
-struct btrfs_root *select_one_root(struct backref_node *node)
+struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
- struct backref_node *next;
+ struct btrfs_backref_node *next;
struct btrfs_root *root;
struct btrfs_root *fs_root = NULL;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
int index = 0;
next = node;
@@ -2668,10 +2245,16 @@ struct btrfs_root *select_one_root(struct backref_node *node)
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- BUG_ON(!root);
- /* no other choice for non-references counted tree */
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ /*
+ * This can occur if we have incomplete extent refs leading all
+ * the way up a particular path, in this case return -EUCLEAN.
+ */
+ if (!root)
+ return ERR_PTR(-EUCLEAN);
+
+ /* No other choice for non-shareable tree */
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2692,12 +2275,12 @@ struct btrfs_root *select_one_root(struct backref_node *node)
static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
- struct backref_node *node, int reserve)
+ struct btrfs_backref_node *node, int reserve)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct backref_node *next = node;
- struct backref_edge *edge;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_node *next = node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
u64 num_bytes = 0;
int index = 0;
@@ -2715,7 +2298,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
break;
edge = list_entry(next->upper.next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2726,7 +2309,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node)
+ struct btrfs_backref_node *node)
{
struct btrfs_root *root = rc->extent_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2744,8 +2327,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
* If we get an enospc just kick back -EAGAIN so we know to drop the
* transaction and try to refill when we can flush all the things.
*/
- ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_LIMIT);
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
while (tmp <= rc->reserved_bytes)
@@ -2774,60 +2357,58 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
*/
static int do_relocation(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
+ struct btrfs_backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct backref_node *upper;
- struct backref_edge *edge;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_node *upper;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
struct btrfs_root *root;
struct extent_buffer *eb;
u32 blocksize;
u64 bytenr;
- u64 generation;
int slot;
- int ret;
- int err = 0;
+ int ret = 0;
- BUG_ON(lowest && node->eb);
+ /*
+ * If we are lowest then this is the first time we're processing this
+ * block, and thus shouldn't have an eb associated with it yet.
+ */
+ ASSERT(!lowest || !node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
- struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
cond_resched();
upper = edge->node[UPPER];
root = select_reloc_root(trans, rc, upper, edges);
- BUG_ON(!root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto next;
+ }
if (upper->eb && !upper->locked) {
if (!lowest) {
- ret = btrfs_bin_search(upper->eb, key,
- upper->level, &slot);
- if (ret < 0) {
- err = ret;
+ ret = btrfs_bin_search(upper->eb, key, &slot);
+ if (ret < 0)
goto next;
- }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
goto next;
}
- drop_node_buffer(upper);
+ btrfs_backref_drop_node_buffer(upper);
}
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret) {
- if (ret < 0)
- err = ret;
- else
- err = -ENOENT;
+ if (ret > 0)
+ ret = -ENOENT;
btrfs_release_path(path);
break;
@@ -2846,12 +2427,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
slot = path->slots[upper->level];
btrfs_release_path(path);
} else {
- ret = btrfs_bin_search(upper->eb, key, upper->level,
- &slot);
- if (ret < 0) {
- err = ret;
+ ret = btrfs_bin_search(upper->eb, key, &slot);
+ if (ret < 0)
goto next;
- }
BUG_ON(ret);
}
@@ -2862,7 +2440,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
upper->eb->start);
- err = -EIO;
+ ret = -EIO;
goto next;
}
} else {
@@ -2871,31 +2449,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
blocksize = root->fs_info->nodesize;
- generation = btrfs_node_ptr_generation(upper->eb, slot);
- btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
- eb = read_tree_block(fs_info, bytenr, generation,
- upper->level - 1, &first_key);
+ eb = btrfs_read_node_slot(upper->eb, slot);
if (IS_ERR(eb)) {
- err = PTR_ERR(eb);
- goto next;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- err = -EIO;
+ ret = PTR_ERR(eb);
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
- BUG_ON(node->eb != eb);
+ /*
+ * We've just COWed this block, it should have updated
+ * the correct backref node entry.
+ */
+ ASSERT(node->eb == eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2906,38 +2478,44 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
- ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb));
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
-
- ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_drop_subtree(trans, root, eb,
+ upper->eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
next:
if (!upper->pending)
- drop_node_buffer(upper);
+ btrfs_backref_drop_node_buffer(upper);
else
- unlock_node_buffer(upper);
- if (err)
+ btrfs_backref_unlock_node_buffer(upper);
+ if (ret)
break;
}
- if (!err && node->pending) {
- drop_node_buffer(node);
+ if (!ret && node->pending) {
+ btrfs_backref_drop_node_buffer(node);
list_move_tail(&node->list, &rc->backref_cache.changed);
node->pending = 0;
}
path->lowest_level = 0;
- BUG_ON(err == -ENOSPC);
- return err;
+
+ /*
+ * We should have allocated all of our space in the block rsv and thus
+ * shouldn't ENOSPC.
+ */
+ ASSERT(ret != -ENOSPC);
+ return ret;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
+ struct btrfs_backref_node *node,
struct btrfs_path *path)
{
struct btrfs_key key;
@@ -2951,15 +2529,15 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int err)
{
LIST_HEAD(list);
- struct backref_cache *cache = &rc->backref_cache;
- struct backref_node *node;
+ struct btrfs_backref_cache *cache = &rc->backref_cache;
+ struct btrfs_backref_node *node;
int level;
int ret;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
node = list_entry(cache->pending[level].next,
- struct backref_node, list);
+ struct btrfs_backref_node, list);
list_move_tail(&node->list, &list);
BUG_ON(!node->pending);
@@ -2974,35 +2552,16 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans,
return err;
}
-static void mark_block_processed(struct reloc_control *rc,
- u64 bytenr, u32 blocksize)
-{
- set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
- EXTENT_DIRTY);
-}
-
-static void __mark_block_processed(struct reloc_control *rc,
- struct backref_node *node)
-{
- u32 blocksize;
- if (node->level == 0 ||
- in_block_group(node->bytenr, rc->block_group)) {
- blocksize = rc->extent_root->fs_info->nodesize;
- mark_block_processed(rc, node->bytenr, blocksize);
- }
- node->processed = 1;
-}
-
/*
* mark a block and all blocks directly/indirectly reference the block
* as processed.
*/
static void update_processed_blocks(struct reloc_control *rc,
- struct backref_node *node)
+ struct btrfs_backref_node *node)
{
- struct backref_node *next = node;
- struct backref_edge *edge;
- struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ struct btrfs_backref_node *next = node;
+ struct btrfs_backref_edge *edge;
+ struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
int index = 0;
while (next) {
@@ -3011,13 +2570,13 @@ static void update_processed_blocks(struct reloc_control *rc,
if (next->processed)
break;
- __mark_block_processed(rc, next);
+ mark_block_processed(rc, next);
if (list_empty(&next->upper))
break;
edge = list_entry(next->upper.next,
- struct backref_edge, list[LOWER]);
+ struct btrfs_backref_edge, list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -3040,12 +2599,11 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- BUG_ON(block->key_ready);
- eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
- block->level, NULL);
- if (IS_ERR(eb)) {
+ eb = read_tree_block(fs_info, block->bytenr, block->owner,
+ block->key.offset, block->level, NULL);
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -3063,7 +2621,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
*/
static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct backref_node *node,
+ struct btrfs_backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path)
{
@@ -3073,32 +2631,77 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
if (!node)
return 0;
+ /*
+ * If we fail here we want to drop our backref_node because we are going
+ * to start over and regenerate the tree for it.
+ */
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
+ goto out;
+
BUG_ON(node->processed);
root = select_one_root(node);
- if (root == ERR_PTR(-ENOENT)) {
- update_processed_blocks(rc, node);
- goto out;
- }
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
- if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
- ret = reserve_metadata_space(trans, rc, node);
- if (ret)
- goto out;
+ /* See explanation in select_one_root for the -EUCLEAN case. */
+ ASSERT(ret == -ENOENT);
+ if (ret == -ENOENT) {
+ ret = 0;
+ update_processed_blocks(rc, node);
+ }
+ goto out;
}
if (root) {
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
- BUG_ON(node->new_bytenr);
- BUG_ON(!list_empty(&node->list));
- btrfs_record_root_in_trans(trans, root);
+ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ /*
+ * This block was the root block of a root, and this is
+ * the first time we're processing the block and thus it
+ * should not have had the ->new_bytenr modified and
+ * should have not been included on the changed list.
+ *
+ * However in the case of corruption we could have
+ * multiple refs pointing to the same block improperly,
+ * and thus we would trip over these checks. ASSERT()
+ * for the developer case, because it could indicate a
+ * bug in the backref code, however error out for a
+ * normal user in the case of corruption.
+ */
+ ASSERT(node->new_bytenr == 0);
+ ASSERT(list_empty(&node->list));
+ if (node->new_bytenr || !list_empty(&node->list)) {
+ btrfs_err(root->fs_info,
+ "bytenr %llu has improper references to it",
+ node->bytenr);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ goto out;
+ /*
+ * Another thread could have failed, need to check if we
+ * have reloc_root actually set.
+ */
+ if (!root->reloc_root) {
+ ret = -ENOENT;
+ goto out;
+ }
root = root->reloc_root;
node->new_bytenr = root->node->start;
- node->root = root;
+ btrfs_put_root(node->root);
+ node->root = btrfs_grab_root(root);
+ ASSERT(node->root);
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
@@ -3109,7 +2712,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
}
out:
if (ret || node->level == 0 || node->cowonly)
- remove_backref_node(&rc->backref_cache, node);
+ btrfs_backref_cleanup_node(&rc->backref_cache, node);
return ret;
}
@@ -3121,7 +2724,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct backref_node *node;
+ struct btrfs_backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
struct tree_block *next;
@@ -3137,7 +2740,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Kick in readahead for tree blocks with missing keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
- readahead_tree_block(fs_info, block->bytenr);
+ btrfs_readahead_tree_block(fs_info, block->bytenr,
+ block->owner, 0,
+ block->level);
}
/* Get first keys */
@@ -3161,9 +2766,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
- err = ret;
- goto out;
+ err = ret;
+ break;
}
}
out:
@@ -3176,66 +2780,116 @@ out_free_blocks:
return err;
}
-static noinline_for_stack
-int prealloc_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(
+ struct btrfs_inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 offset = inode->index_cnt;
u64 num_bytes;
- int nr = 0;
+ int nr;
int ret = 0;
+ u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
- u64 cur_offset;
- struct extent_changeset *data_reserved = NULL;
+ u64 cur_offset = prealloc_start;
- BUG_ON(cluster->start != cluster->boundary[0]);
- inode_lock(inode);
+ /*
+ * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
+ * This means the range [i_size, PAGE_END + 1) is filled with zeros by
+ * btrfs_do_readpage() call of previously relocated file cluster.
+ *
+ * If the current cluster starts in the above range, btrfs_do_readpage()
+ * will skip the read, and relocate_one_page() will later writeback
+ * the padding zeros as new data, causing data corruption.
+ *
+ * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ */
+ if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ struct page *page;
+
+ ASSERT(sectorsize < PAGE_SIZE);
+ ASSERT(IS_ALIGNED(i_size, sectorsize));
+
+ /*
+ * Subpage can't handle page with DIRTY but without UPTODATE
+ * bit as it can lead to the following deadlock:
+ *
+ * btrfs_read_folio()
+ * | Page already *locked*
+ * |- btrfs_lock_and_flush_ordered_range()
+ * |- btrfs_start_ordered_extent()
+ * |- extent_write_cache_pages()
+ * |- lock_page()
+ * We try to lock the page we already hold.
+ *
+ * Here we just writeback the whole data reloc inode, so that
+ * we will be ensured to have no dirty range in the page, and
+ * are safe to clear the uptodate bits.
+ *
+ * This shouldn't cause too much overhead, as we need to write
+ * the data back anyway.
+ */
+ ret = filemap_write_and_wait(mapping);
+ if (ret < 0)
+ return ret;
- ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
- prealloc_end + 1 - prealloc_start);
+ clear_extent_bits(&inode->io_tree, i_size,
+ round_up(i_size, PAGE_SIZE) - 1,
+ EXTENT_UPTODATE);
+ page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ /*
+ * If page is freed we don't need to do anything then, as we
+ * will re-read the whole page anyway.
+ */
+ if (page) {
+ btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ round_up(i_size, PAGE_SIZE) - i_size);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
+ BUG_ON(cluster->start != cluster->boundary[0]);
+ ret = btrfs_alloc_data_chunk_ondemand(inode,
+ prealloc_end + 1 - prealloc_start);
if (ret)
- goto out;
+ return ret;
- cur_offset = prealloc_start;
- while (nr < cluster->nr) {
+ btrfs_inode_lock(&inode->vfs_inode, 0);
+ for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end, NULL);
num_bytes = end + 1 - start;
- if (cur_offset < start)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, start - cur_offset);
- ret = btrfs_prealloc_file_range(inode, 0, start,
+ ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
if (ret)
break;
- nr++;
}
+ btrfs_inode_unlock(&inode->vfs_inode, 0);
+
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, prealloc_end + 1 - cur_offset);
-out:
- inode_unlock(inode);
- extent_changeset_free(data_reserved);
+ btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
+ prealloc_end + 1 - cur_offset);
return ret;
}
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
- u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -3249,34 +2903,169 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ free_extent_map(em);
+
+ return ret;
+}
+
+/*
+ * Allow error injection to test balance/relocation cancellation
+ */
+noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ atomic_read(&fs_info->reloc_cancel_req) ||
+ fatal_signal_pending(current);
+}
+ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
+
+static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+ int cluster_nr)
+{
+ /* Last extent, use cluster end directly */
+ if (cluster_nr >= cluster->nr - 1)
+ return cluster->end;
+
+ /* Use next boundary start*/
+ return cluster->boundary[cluster_nr + 1] - 1;
+}
+
+static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
+ struct file_extent_cluster *cluster,
+ int *cluster_nr, unsigned long page_index)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ struct page *page;
+ u64 page_start;
+ u64 page_end;
+ u64 cur;
+ int ret;
+
+ ASSERT(page_index <= last_index);
+ page = find_lock_page(inode->i_mapping, page_index);
+ if (!page) {
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+ page_index, last_index + 1 - page_index);
+ page = find_or_create_page(inode->i_mapping, page_index, mask);
+ if (!page)
+ return -ENOMEM;
+ }
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto release_page;
+
+ if (PageReadahead(page))
+ page_cache_async_readahead(inode->i_mapping, ra, NULL,
+ page_folio(page), page_index,
+ last_index + 1 - page_index);
+
+ if (!PageUptodate(page)) {
+ btrfs_read_folio(NULL, page_folio(page));
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto release_page;
+ }
+ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+
+ /*
+ * Start from the cluster, as for subpage case, the cluster can start
+ * inside the page.
+ */
+ cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= page_end) {
+ u64 extent_start = cluster->boundary[*cluster_nr] - offset;
+ u64 extent_end = get_cluster_boundary_end(cluster,
+ *cluster_nr) - offset;
+ u64 clamped_start = max(page_start, extent_start);
+ u64 clamped_end = min(page_end, extent_end);
+ u32 clamped_len = clamped_end + 1 - clamped_start;
+
+ /* Reserve metadata for this range */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+ clamped_len, clamped_len,
+ false);
+ if (ret)
+ goto release_page;
+
+ /* Mark the range delalloc and dirty for later writeback */
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
+ clamped_end, 0, NULL);
+ if (ret) {
+ clear_extent_bits(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ clamped_len, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ clamped_len);
+ goto release_page;
+ }
+ btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+
+ /*
+ * Set the boundary if it's inside the page.
+ * Data relocation requires the destination extents to have the
+ * same size as the source.
+ * EXTENT_BOUNDARY bit prevents current extent from being merged
+ * with previous extent.
+ */
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ page_start, PAGE_SIZE)) {
+ u64 boundary_start = cluster->boundary[*cluster_nr] -
+ offset;
+ u64 boundary_end = boundary_start +
+ fs_info->sectorsize - 1;
+
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
+ cur += clamped_len;
+
+ /* Crossed extent end, go to next extent */
+ if (cur >= extent_end) {
+ (*cluster_nr)++;
+ /* Just finished the last extent of the cluster, exit. */
+ if (*cluster_nr >= cluster->nr)
+ break;
}
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ unlock_page(page);
+ put_page(page);
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(fs_info);
+ if (btrfs_should_cancel_balance(fs_info))
+ ret = -ECANCELED;
+ return ret;
+
+release_page:
+ unlock_page(page);
+ put_page(page);
return ret;
}
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 page_start;
- u64 page_end;
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
- struct page *page;
struct file_ra_state *ra;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- int nr = 0;
+ int cluster_nr = 0;
int ret = 0;
if (!cluster->nr)
@@ -3286,107 +3075,23 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(inode, cluster);
+ ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
if (ret)
goto out;
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_extent_mapping(inode, cluster->start - offset,
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
- index = (cluster->start - offset) >> PAGE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
- while (index <= last_index) {
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- PAGE_SIZE);
- if (ret)
- goto out;
-
- page = find_lock_page(inode->i_mapping, index);
- if (!page) {
- page_cache_sync_readahead(inode->i_mapping,
- ra, NULL, index,
- last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- mask);
- if (!page) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping,
- ra, NULL, page, index,
- last_index + 1 - index);
- }
-
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
- ret = -EIO;
- goto out;
- }
- }
-
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
-
- lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
-
- set_page_extent_mapped(page);
-
- if (nr < cluster->nr &&
- page_start + offset == cluster->boundary[nr]) {
- set_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end,
- EXTENT_BOUNDARY);
- nr++;
- }
-
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
- NULL);
- if (ret) {
- unlock_page(page);
- put_page(page);
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
-
- clear_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
- goto out;
-
- }
- set_page_dirty(page);
-
- unlock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end);
- unlock_page(page);
- put_page(page);
-
- index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- balance_dirty_pages_ratelimited(inode->i_mapping);
- btrfs_throttle(fs_info);
- }
- WARN_ON(nr != cluster->nr);
+ for (index = (cluster->start - offset) >> PAGE_SHIFT;
+ index <= last_index && !ret; index++)
+ ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
+ if (ret == 0)
+ WARN_ON(cluster_nr != cluster->nr);
out:
kfree(ra);
return ret;
@@ -3439,21 +3144,58 @@ static int add_tree_block(struct reloc_control *rc,
u32 item_size;
int level = -1;
u64 generation;
+ u64 owner = 0;
eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
item_size >= sizeof(*ei) + sizeof(*bi)) {
+ unsigned long ptr = 0, end;
+
ei = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_item);
+ end = (unsigned long)ei + item_size;
if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
bi = (struct btrfs_tree_block_info *)(ei + 1);
level = btrfs_tree_block_level(eb, bi);
+ ptr = (unsigned long)(bi + 1);
} else {
level = (int)extent_key->offset;
+ ptr = (unsigned long)(ei + 1);
}
generation = btrfs_extent_generation(eb, ei);
+
+ /*
+ * We're reading random blocks without knowing their owner ahead
+ * of time. This is ok most of the time, as all reloc roots and
+ * fs roots have the same lock type. However normal trees do
+ * not, and the only way to know ahead of time is to read the
+ * inline ref offset. We know it's an fs root if
+ *
+ * 1. There's more than one ref.
+ * 2. There's a SHARED_DATA_REF_KEY set.
+ * 3. FULL_BACKREF is set on the flags.
+ *
+ * Otherwise it's safe to assume that the ref offset == the
+ * owner of this block, so we can use that when calling
+ * read_tree_block.
+ */
+ if (btrfs_extent_refs(eb, ei) == 1 &&
+ !(btrfs_extent_flags(eb, ei) &
+ BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
+ ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID)
+ return -EINVAL;
+ if (type == BTRFS_TREE_BLOCK_REF_KEY)
+ owner = btrfs_extent_inline_ref_offset(eb, iref);
+ }
} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
btrfs_print_v0_err(eb->fs_info);
btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
@@ -3475,10 +3217,12 @@ static int add_tree_block(struct reloc_control *rc,
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
+ block->owner = owner;
- rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+ rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, block->bytenr);
+ btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
+ -EEXIST);
return 0;
}
@@ -3499,7 +3243,7 @@ static int __add_tree_block(struct reloc_control *rc,
if (tree_block_processed(bytenr, rc))
return 0;
- if (tree_search(blocks, bytenr))
+ if (rb_simple_search(blocks, bytenr))
return 0;
path = btrfs_alloc_path();
@@ -3556,37 +3300,11 @@ out:
return ret;
}
-/*
- * helper to check if the block use full backrefs for pointers in it
- */
-static int block_use_full_backref(struct reloc_control *rc,
- struct extent_buffer *eb)
-{
- u64 flags;
- int ret;
-
- if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
- btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
- return 1;
-
- ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
- eb->start, btrfs_header_level(eb), 1,
- NULL, &flags);
- BUG_ON(ret);
-
- if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = 1;
- else
- ret = 0;
- return ret;
-}
-
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
- struct btrfs_key key;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int ret = 0;
@@ -3594,11 +3312,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
if (inode)
goto truncate;
- key.objectid = ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, root);
+ inode = btrfs_iget(fs_info->sb, ino, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3624,172 +3338,45 @@ out:
}
/*
- * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
- * this function scans fs tree to find blocks reference the data extent
+ * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
+ * cache inode, to avoid free space cache data extent blocking data relocation.
*/
-static int find_data_references(struct reloc_control *rc,
- struct btrfs_key *extent_key,
- struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
- struct rb_root *blocks)
+static int delete_v1_space_cache(struct extent_buffer *leaf,
+ struct btrfs_block_group *block_group,
+ u64 data_bytenr)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
- struct tree_block *block;
- struct btrfs_root *root;
- struct btrfs_file_extent_item *fi;
- struct rb_node *rb_node;
+ u64 space_cache_ino;
+ struct btrfs_file_extent_item *ei;
struct btrfs_key key;
- u64 ref_root;
- u64 ref_objectid;
- u64 ref_offset;
- u32 ref_count;
- u32 nritems;
- int err = 0;
- int added = 0;
- int counted;
+ bool found = false;
+ int i;
int ret;
- ref_root = btrfs_extent_data_ref_root(leaf, ref);
- ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
- ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
- ref_count = btrfs_extent_data_ref_count(leaf, ref);
-
- /*
- * This is an extent belonging to the free space cache, lets just delete
- * it and redo the search.
- */
- if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
- ret = delete_block_group_cache(fs_info, rc->block_group,
- NULL, ref_objectid);
- if (ret != -ENOENT)
- return ret;
- ret = 0;
- }
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_FORWARD;
-
- root = read_fs_root(fs_info, ref_root);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
-
- key.objectid = ref_objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- if (ref_offset > ((u64)-1 << 32))
- key.offset = 0;
- else
- key.offset = ref_offset;
-
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- /*
- * the references in tree blocks that use full backrefs
- * are not counted in
- */
- if (block_use_full_backref(rc, leaf))
- counted = 0;
- else
- counted = 1;
- rb_node = tree_search(blocks, leaf->start);
- if (rb_node) {
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
- }
-
- while (ref_count > 0) {
- while (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (WARN_ON(ret > 0))
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- added = 0;
+ if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
+ return 0;
- if (block_use_full_backref(rc, leaf))
- counted = 0;
- else
- counted = 1;
- rb_node = tree_search(blocks, leaf->start);
- if (rb_node) {
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
- }
- }
+ for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ u8 type;
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (WARN_ON(key.objectid != ref_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY))
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+
+ if ((type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) &&
+ btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
+ found = true;
+ space_cache_ino = key.objectid;
break;
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- goto next;
-
- if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
- extent_key->objectid)
- goto next;
-
- key.offset -= btrfs_file_extent_offset(leaf, fi);
- if (key.offset != ref_offset)
- goto next;
-
- if (counted)
- ref_count--;
- if (added)
- goto next;
-
- if (!tree_block_processed(leaf->start, rc)) {
- block = kmalloc(sizeof(*block), GFP_NOFS);
- if (!block) {
- err = -ENOMEM;
- break;
- }
- block->bytenr = leaf->start;
- btrfs_item_key_to_cpu(leaf, &block->key, 0);
- block->level = 0;
- block->key_ready = 1;
- rb_node = tree_insert(blocks, block->bytenr,
- &block->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST,
- block->bytenr);
}
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
-next:
- path->slots[0]++;
-
}
-out:
- btrfs_free_path(path);
- return err;
+ if (!found)
+ return -ENOENT;
+ ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
+ space_cache_ino);
+ return ret;
}
/*
@@ -3801,91 +3388,41 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_path *path,
struct rb_root *blocks)
{
- struct btrfs_key key;
- struct extent_buffer *eb;
- struct btrfs_extent_data_ref *dref;
- struct btrfs_extent_inline_ref *iref;
- unsigned long ptr;
- unsigned long end;
- u32 blocksize = rc->extent_root->fs_info->nodesize;
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct ulist *leaves = NULL;
+ struct ulist_iterator leaf_uiter;
+ struct ulist_node *ref_node = NULL;
+ const u32 blocksize = fs_info->nodesize;
int ret = 0;
- int err = 0;
- eb = path->nodes[0];
- ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
- ptr += sizeof(struct btrfs_extent_item);
-
- while (ptr < end) {
- iref = (struct btrfs_extent_inline_ref *)ptr;
- key.type = btrfs_get_extent_inline_ref_type(eb, iref,
- BTRFS_REF_TYPE_DATA);
- if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- key.offset = btrfs_extent_inline_ref_offset(eb, iref);
- ret = __add_tree_block(rc, key.offset, blocksize,
- blocks);
- } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = (struct btrfs_extent_data_ref *)(&iref->offset);
- ret = find_data_references(rc, extent_key,
- eb, dref, blocks);
- } else {
- ret = -EUCLEAN;
- btrfs_err(rc->extent_root->fs_info,
- "extent %llu slot %d has an invalid inline ref type",
- eb->start, path->slots[0]);
- }
- if (ret) {
- err = ret;
- goto out;
- }
- ptr += btrfs_extent_inline_ref_size(key.type);
- }
- WARN_ON(ptr > end);
+ btrfs_release_path(path);
+ ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
+ 0, &leaves, NULL, true);
+ if (ret < 0)
+ return ret;
- while (1) {
- cond_resched();
- eb = path->nodes[0];
- if (path->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(rc->extent_root, path);
- if (ret < 0) {
- err = ret;
- break;
- }
- if (ret > 0)
- break;
- eb = path->nodes[0];
- }
+ ULIST_ITER_INIT(&leaf_uiter);
+ while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
+ struct extent_buffer *eb;
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
- if (key.objectid != extent_key->objectid)
+ eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
break;
-
- if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- ret = __add_tree_block(rc, key.offset, blocksize,
- blocks);
- } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_extent_data_ref);
- ret = find_data_references(rc, extent_key,
- eb, dref, blocks);
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- btrfs_print_v0_err(eb->fs_info);
- btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
- ret = -EINVAL;
- } else {
- ret = 0;
}
- if (ret) {
- err = ret;
+ ret = delete_v1_space_cache(eb, rc->block_group,
+ extent_key->objectid);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ break;
+ ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
+ if (ret < 0)
break;
- }
- path->slots[0]++;
}
-out:
- btrfs_release_path(path);
- if (err)
+ if (ret < 0)
free_block_list(blocks);
- return err;
+ ulist_free(leaves);
+ return ret;
}
/*
@@ -3992,20 +3529,6 @@ static void unset_reloc_control(struct reloc_control *rc)
mutex_unlock(&fs_info->reloc_mutex);
}
-static int check_extent_flags(u64 flags)
-{
- if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
- return 1;
- if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
- !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
- return 1;
- if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- return 1;
- return 0;
-}
-
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
@@ -4025,7 +3548,7 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
RELOCATION_RESERVED_NODES;
- ret = btrfs_block_rsv_refill(rc->extent_root,
+ ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
rc->block_rsv, rc->block_rsv->size,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
@@ -4044,8 +3567,12 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- btrfs_commit_transaction(trans);
- return 0;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -4057,7 +3584,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_path *path;
struct btrfs_extent_item *ei;
u64 flags;
- u32 item_size;
int ret;
int err = 0;
int progress = 0;
@@ -4075,9 +3601,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
while (1) {
rc->reserved_bytes = 0;
- ret = btrfs_block_rsv_refill(rc->extent_root,
- rc->block_rsv, rc->block_rsv->size,
- BTRFS_RESERVE_FLUSH_ALL);
+ ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
+ rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
err = ret;
break;
@@ -4106,19 +3632,7 @@ restart:
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- flags = btrfs_extent_flags(path->nodes[0], ei);
- ret = check_extent_flags(flags);
- BUG_ON(ret);
- } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
- err = -EINVAL;
- btrfs_print_v0_err(trans->fs_info);
- btrfs_abort_transaction(trans, err);
- break;
- } else {
- BUG();
- }
+ flags = btrfs_extent_flags(path->nodes[0], ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
@@ -4137,12 +3651,6 @@ restart:
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
- /*
- * if we fail to relocate tree blocks, force to update
- * backref cache when committing transaction.
- */
- rc->backref_cache.last_trans = trans->transid - 1;
-
if (ret != -EAGAIN) {
err = ret;
break;
@@ -4166,6 +3674,10 @@ restart:
break;
}
}
+ if (btrfs_should_cancel_balance(fs_info)) {
+ err = -ECANCELED;
+ break;
+ }
}
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
@@ -4194,16 +3706,24 @@ restart:
rc->create_reloc_tree = 0;
set_reloc_control(rc);
- backref_cache_cleanup(&rc->backref_cache);
- btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
+ btrfs_backref_release_cache(&rc->backref_cache);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
+ /*
+ * Even in the case when the relocation is cancelled, we should all go
+ * through prepare_to_merge() and merge_reloc_roots().
+ *
+ * For error (including cancelled balance), prepare_to_merge() will
+ * mark all reloc trees orphan, then queue them for cleanup in
+ * merge_reloc_roots()
+ */
err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
- btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
@@ -4211,11 +3731,13 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
+out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
-out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
return err;
@@ -4251,6 +3773,35 @@ out:
return ret;
}
+static void delete_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, root, path);
+out:
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ btrfs_free_path(path);
+}
+
/*
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
@@ -4262,44 +3813,86 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
- struct btrfs_key key;
u64 objectid;
int err = 0;
- root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (IS_ERR(root))
- return ERR_CAST(root);
-
+ root = btrfs_grab_root(fs_info->data_reloc_root);
trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ btrfs_put_root(root);
return ERR_CAST(trans);
+ }
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out;
err = __insert_orphan_inode(trans, root, objectid);
- BUG_ON(err);
+ if (err)
+ goto out;
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root);
- BUG_ON(IS_ERR(inode));
+ inode = btrfs_iget(fs_info->sb, objectid, root);
+ if (IS_ERR(inode)) {
+ delete_orphan_inode(trans, root, objectid);
+ err = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
+ btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
- if (inode)
- iput(inode);
+ iput(inode);
inode = ERR_PTR(err);
}
return inode;
}
+/*
+ * Mark start of chunk relocation that is cancellable. Check if the cancellation
+ * has been requested meanwhile and don't start in that case.
+ *
+ * Return:
+ * 0 success
+ * -EINPROGRESS operation is already in progress, that's probably a bug
+ * -ECANCELED cancellation request was set before the operation started
+ */
+static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
+{
+ if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+ /* This should not happen */
+ btrfs_err(fs_info, "reloc already running, cannot start");
+ return -EINPROGRESS;
+ }
+
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
+ btrfs_info(fs_info, "chunk relocation canceled on start");
+ /*
+ * On cancel, clear all requests but let the caller mark
+ * the end after cleanup operations.
+ */
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+ return -ECANCELED;
+ }
+ return 0;
+}
+
+/*
+ * Mark end of chunk relocation that is cancellable and wake any waiters.
+ */
+static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
+{
+ /* Requested after start, clear bit first so any waiters can continue */
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0)
+ btrfs_info(fs_info, "chunk relocation canceled during operation");
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+}
+
static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -4310,13 +3903,25 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
- backref_cache_init(&rc->backref_cache);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(fs_info, &rc->processed_blocks,
IO_TREE_RELOC_BLOCKS, NULL);
return rc;
}
+static void free_reloc_control(struct reloc_control *rc)
+{
+ struct mapping_node *node, *tmp;
+
+ free_reloc_roots(&rc->reloc_roots);
+ rbtree_postorder_for_each_entry_safe(node, tmp,
+ &rc->reloc_root_tree.rb_root, rb_node)
+ kfree(node);
+
+ kfree(rc);
+}
+
/*
* Print the block group being relocated
*/
@@ -4347,7 +3952,7 @@ static const char *stage_to_string(int stage)
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
struct btrfs_block_group *bg;
- struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
struct reloc_control *rc;
struct inode *inode;
struct btrfs_path *path;
@@ -4355,10 +3960,34 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
+ /*
+ * This only gets set if we had a half-deleted snapshot on mount. We
+ * cannot allow relocation to start while we're still trying to clean up
+ * these pending deletions.
+ */
+ ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+ if (ret)
+ return ret;
+
+ /* We may have been woken up by close_ctree, so bail if we're closing. */
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
bg = btrfs_lookup_block_group(fs_info, group_start);
if (!bg)
return -ENOENT;
+ /*
+ * Relocation of a data block group creates ordered extents. Without
+ * sb_start_write(), we can freeze the filesystem while unfinished
+ * ordered extents are left. Such ordered extents can cause a deadlock
+ * e.g. when syncfs() is waiting for their completion but they can't
+ * finish because they block when joining a transaction, due to the
+ * fact that the freeze locks are being held in write mode.
+ */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ ASSERT(sb_write_started(fs_info->sb));
+
if (btrfs_pinned_by_swapfile(fs_info, bg)) {
btrfs_put_block_group(bg);
return -ETXTBSY;
@@ -4370,6 +3999,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
return -ENOMEM;
}
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_put_bg;
+ }
+
rc->extent_root = extent_root;
rc->block_group = bg;
@@ -4414,6 +4049,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->start,
rc->block_group->length);
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
while (1) {
int finishes_stage;
@@ -4460,8 +4098,10 @@ out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
- btrfs_put_block_group(rc->block_group);
- kfree(rc);
+out_put_bg:
+ btrfs_put_block_group(bg);
+ reloc_chunk_end(fs_info);
+ free_reloc_control(rc);
return err;
}
@@ -4477,7 +4117,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
- root->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&root->root_item, 0);
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
@@ -4494,9 +4134,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
* this function resumes merging reloc trees with corresponding fs trees.
* this is important for keeping the sharing of tree blocks
*/
-int btrfs_recover_relocation(struct btrfs_root *root)
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(reloc_roots);
struct btrfs_key key;
struct btrfs_root *fs_root;
@@ -4537,17 +4176,18 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root(root, &key);
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
}
+ set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
list_add(&reloc_root->root_list, &reloc_roots);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- fs_root = read_fs_root(fs_info,
- reloc_root->root_key.offset);
+ fs_root = btrfs_get_fs_root(fs_info,
+ reloc_root->root_key.offset, false);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
if (ret != -ENOENT) {
@@ -4559,6 +4199,8 @@ int btrfs_recover_relocation(struct btrfs_root *root)
err = ret;
goto out;
}
+ } else {
+ btrfs_put_root(fs_root);
}
}
@@ -4578,15 +4220,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out;
}
- rc->extent_root = fs_info->extent_root;
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_end;
+ }
+
+ rc->extent_root = btrfs_extent_root(fs_info, 0);
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- unset_reloc_control(rc);
err = PTR_ERR(trans);
- goto out_free;
+ goto out_unset;
}
rc->merge_reloc_tree = 1;
@@ -4602,21 +4249,30 @@ int btrfs_recover_relocation(struct btrfs_root *root)
continue;
}
- fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
+ fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
+ false);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
- goto out_free;
+ btrfs_end_transaction(trans);
+ goto out_unset;
}
err = __add_reloc_root(reloc_root);
- BUG_ON(err < 0); /* -ENOMEM or logic error */
- fs_root->reloc_root = reloc_root;
+ ASSERT(err != -EEXIST);
+ if (err) {
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(fs_root);
+ btrfs_end_transaction(trans);
+ goto out_unset;
+ }
+ fs_root->reloc_root = btrfs_grab_root(reloc_root);
+ btrfs_put_root(fs_root);
}
err = btrfs_commit_transaction(trans);
if (err)
- goto out_free;
+ goto out_unset;
merge_reloc_roots(rc);
@@ -4625,28 +4281,29 @@ int btrfs_recover_relocation(struct btrfs_root *root)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto out_free;
+ goto out_clean;
}
err = btrfs_commit_transaction(trans);
-
+out_clean:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
-out_free:
- kfree(rc);
+out_unset:
+ unset_reloc_control(rc);
+out_end:
+ reloc_chunk_end(fs_info);
+ free_reloc_control(rc);
out:
- if (!list_empty(&reloc_roots))
- free_reloc_roots(&reloc_roots);
+ free_reloc_roots(&reloc_roots);
btrfs_free_path(path);
if (err == 0) {
/* cleanup orphan inode in data relocation tree */
- fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (IS_ERR(fs_root))
- err = PTR_ERR(fs_root);
- else
- err = btrfs_orphan_cleanup(fs_root);
+ fs_root = btrfs_grab_root(fs_info->data_reloc_root);
+ ASSERT(fs_root);
+ err = btrfs_orphan_cleanup(fs_root);
+ btrfs_put_root(fs_root);
}
return err;
}
@@ -4657,9 +4314,10 @@ out:
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_root *csum_root;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
@@ -4670,9 +4328,10 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
- ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list, 0);
+ disk_bytenr = file_pos + inode->index_cnt;
+ csum_root = btrfs_csum_root(fs_info, disk_bytenr);
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + len - 1, &list, 0, false);
if (ret)
goto out;
@@ -4708,7 +4367,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct reloc_control *rc;
- struct backref_node *node;
+ struct btrfs_backref_node *node;
int first_cow = 0;
int level;
int ret = 0;
@@ -4717,13 +4376,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
if (!rc)
return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
- root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
-
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (buf == root->node)
- __update_reloc_root(root, cow->start);
- }
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
@@ -4738,7 +4391,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(node->bytenr != buf->start &&
node->new_bytenr != buf->start);
- drop_node_buffer(node);
+ btrfs_backref_drop_node_buffer(node);
atomic_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
@@ -4750,7 +4403,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
}
if (first_cow)
- __mark_block_processed(rc, node);
+ mark_block_processed(rc, node);
if (first_cow && level > 0)
rc->nodes_relocated += buf->len;
@@ -4795,6 +4448,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
/*
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
+ *
+ * This is similar to btrfs_init_reloc_root(), we come out of here with two
+ * references held on the reloc_root, one for root->reloc_root and one for
+ * rc->reloc_roots.
*/
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
@@ -4826,8 +4483,13 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
- BUG_ON(ret < 0);
- new_root->reloc_root = reloc_root;
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
+ new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c74550..e1f599d7a916 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,11 +22,10 @@
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
- uuid_le uuid;
u32 len;
int need_reset = 0;
- len = btrfs_item_size_nr(eb, slot);
+ len = btrfs_item_size(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
@@ -40,12 +39,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
need_reset = 1;
}
if (need_reset) {
- memset(&item->generation_v2, 0,
- sizeof(*item) - offsetof(struct btrfs_root_item,
- generation_v2));
-
- uuid_le_gen(&uuid);
- memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+ /* Clear all members from generation_v2 onwards. */
+ memset_startat(item, 0, generation_v2);
+ generate_random_guid(item->uuid);
}
}
@@ -150,7 +146,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
- old_len = btrfs_item_size_nr(l, slot);
+ old_len = btrfs_item_size(l, slot);
/*
* If this is the first time we update the root item which originated
@@ -212,7 +208,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_key root_key;
struct btrfs_root *root;
int err = 0;
int ret;
@@ -225,10 +220,9 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
-
while (1) {
+ u64 root_objectid;
+
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
err = ret;
@@ -252,28 +246,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- root_key.objectid = key.offset;
+ root_objectid = key.offset;
key.offset++;
- /*
- * The root might have been inserted already, as before we look
- * for orphan roots, log replay might have happened, which
- * triggers a transaction commit and qgroup accounting, which
- * in turn reads and inserts fs roots while doing backref
- * walking.
- */
- root = btrfs_lookup_fs_root(fs_info, root_key.objectid);
- if (root) {
- WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
- &root->state));
- if (btrfs_root_refs(&root->root_item) == 0) {
- set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
- btrfs_add_dead_root(root);
- }
- continue;
- }
-
- root = btrfs_read_fs_root(tree_root, &root_key);
+ root = btrfs_get_fs_root(fs_info, root_objectid, false);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
@@ -290,7 +266,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
break;
}
err = btrfs_del_orphan_item(trans, tree_root,
- root_key.objectid);
+ root_objectid);
btrfs_end_transaction(trans);
if (err) {
btrfs_handle_fs_error(fs_info, err,
@@ -300,25 +276,27 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
continue;
}
- err = btrfs_init_fs_root(root);
- if (err) {
- btrfs_free_fs_root(root);
- break;
- }
-
- set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
-
- err = btrfs_insert_fs_root(fs_info, root);
- if (err) {
- BUG_ON(err == -EEXIST);
- btrfs_free_fs_root(root);
- break;
- }
-
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
+ struct btrfs_key drop_key;
+
+ btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+ /*
+ * If we have a non-zero drop_progress then we know we
+ * made it partly through deleting this snapshot, and
+ * thus we need to make sure we block any balance from
+ * happening until this snapshot is completely dropped.
+ */
+ if (drop_key.objectid != 0 || drop_key.type != 0 ||
+ drop_key.offset != 0) {
+ set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+ set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+ }
+
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
+ btrfs_put_root(root);
}
btrfs_free_path(path);
@@ -359,7 +337,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
- int err = 0;
int ret;
path = btrfs_alloc_path();
@@ -371,8 +348,9 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
@@ -380,18 +358,18 @@ again:
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name_len) ||
memcmp_extent_buffer(leaf, name, ptr, name_len)) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
}
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
- } else
- err = -ENOENT;
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -403,7 +381,7 @@ again:
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
/*
@@ -530,7 +508,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
- qgroup_num_bytes, true);
+ qgroup_num_bytes, true,
+ false);
if (ret)
return ret;
}
@@ -538,7 +517,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
rsv->space_info = btrfs_find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
@@ -547,11 +526,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 61b37c56a7fb..196c4c6ed1ed 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -20,6 +20,7 @@
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
+#include "zoned.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -38,44 +39,39 @@ struct scrub_block;
struct scrub_ctx;
/*
- * the following three values only influence the performance.
+ * The following three values only influence the performance.
+ *
* The last one configures the number of parallel and outstanding I/O
- * operations. The first two values configure an upper limit for the number
+ * operations. The first one configures an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
-#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
-#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
+#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
/*
- * the following value times PAGE_SIZE needs to be large enough to match the
+ * The following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
- * Values larger than BTRFS_STRIPE_LEN are not supported.
*/
-#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
+#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+
+#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
struct scrub_recover {
refcount_t refs;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 map_length;
};
-struct scrub_page {
+struct scrub_sector {
struct scrub_block *sblock;
- struct page *page;
- struct btrfs_device *dev;
struct list_head list;
u64 flags; /* extent flags */
u64 generation;
- u64 logical;
- u64 physical;
- u64 physical_for_dev_replace;
+ /* Offset in bytes to @sblock. */
+ u32 offset;
atomic_t refs;
- struct {
- unsigned int mirror_num:8;
- unsigned int have_csum:1;
- unsigned int io_error:1;
- };
+ unsigned int have_csum:1;
+ unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
@@ -89,20 +85,30 @@ struct scrub_bio {
blk_status_t status;
u64 logical;
u64 physical;
-#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
- struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
-#else
- struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
-#endif
- int page_count;
+ struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
+ int sector_count;
int next_free;
- struct btrfs_work work;
+ struct work_struct work;
};
struct scrub_block {
- struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
- int page_count;
- atomic_t outstanding_pages;
+ /*
+ * Each page will have its page::private used to record the logical
+ * bytenr.
+ */
+ struct page *pages[SCRUB_MAX_PAGES];
+ struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ struct btrfs_device *dev;
+ /* Logical bytenr of the sblock */
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ /* Length of sblock in bytes */
+ u32 len;
+ int sector_count;
+ int mirror_num;
+
+ atomic_t outstanding_sectors;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
@@ -116,7 +122,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
- struct btrfs_work work;
+ struct work_struct work;
};
/* Used for the chunks with parity stripe such RAID5/6 */
@@ -131,25 +137,23 @@ struct scrub_parity {
int nsectors;
- u64 stripe_len;
+ u32 stripe_len;
refcount_t refs;
- struct list_head spages;
+ struct list_head sectors_list;
/* Work of parity check and repair */
- struct btrfs_work work;
+ struct work_struct work;
/* Mark the parity blocks which have data */
- unsigned long *dbitmap;
+ unsigned long dbitmap;
/*
* Mark the parity blocks which have data, but errors happen when
* read data or check data
*/
- unsigned long *ebitmap;
-
- unsigned long bitmap[0];
+ unsigned long ebitmap;
};
struct scrub_ctx {
@@ -161,17 +165,20 @@ struct scrub_ctx {
atomic_t workers_pending;
spinlock_t list_lock;
wait_queue_head_t list_wait;
- u16 csum_size;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_rd_bio;
+ int sectors_per_bio;
+
+ /* State of IO submission throttling affecting the associated device */
+ ktime_t throttle_deadline;
+ u64 throttle_sent;
int is_dev_replace;
+ u64 write_pointer;
struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
- int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
struct btrfs_device *wr_tgtdev;
bool flush_all_writes;
@@ -207,59 +214,217 @@ struct full_stripe_lock {
struct mutex mutex;
};
-static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
-static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+#ifndef CONFIG_64BIT
+/* This structure is for archtectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+ u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+ attach_page_private(page, (void *)logical);
+ return 0;
+#else
+ struct scrub_page_private *spp;
+
+ spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+ if (!spp)
+ return -ENOMEM;
+ spp->logical = logical;
+ attach_page_private(page, (void *)spp);
+ return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+ detach_page_private(page);
+ return;
+#else
+ struct scrub_page_private *spp;
+
+ spp = detach_page_private(page);
+ kfree(spp);
+ return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+ struct btrfs_device *dev,
+ u64 logical, u64 physical,
+ u64 physical_for_dev_replace,
+ int mirror_num)
+{
+ struct scrub_block *sblock;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ if (!sblock)
+ return NULL;
+ refcount_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->logical = logical;
+ sblock->physical = physical;
+ sblock->physical_for_dev_replace = physical_for_dev_replace;
+ sblock->dev = dev;
+ sblock->mirror_num = mirror_num;
+ sblock->no_io_error_seen = 1;
+ /*
+ * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+ * the corresponding page is not allocated.
+ */
+ return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+ u64 logical, gfp_t gfp)
+{
+ const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+ struct scrub_sector *ssector;
+
+ /* We must never have scrub_block exceed U32_MAX in size. */
+ ASSERT(logical - sblock->logical < U32_MAX);
+
+ ssector = kzalloc(sizeof(*ssector), gfp);
+ if (!ssector)
+ return NULL;
+
+ /* Allocate a new page if the slot is not allocated */
+ if (!sblock->pages[page_index]) {
+ int ret;
+
+ sblock->pages[page_index] = alloc_page(gfp);
+ if (!sblock->pages[page_index]) {
+ kfree(ssector);
+ return NULL;
+ }
+ ret = attach_scrub_page_private(sblock->pages[page_index],
+ sblock->logical + (page_index << PAGE_SHIFT));
+ if (ret < 0) {
+ kfree(ssector);
+ __free_page(sblock->pages[page_index]);
+ sblock->pages[page_index] = NULL;
+ return NULL;
+ }
+ }
+
+ atomic_set(&ssector->refs, 1);
+ ssector->sblock = sblock;
+ /* The sector to be added should not be used */
+ ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+ ssector->offset = logical - sblock->logical;
+
+ /* The sector count must be smaller than the limit */
+ ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ sblock->sectors[sblock->sector_count] = ssector;
+ sblock->sector_count++;
+ sblock->len += sblock->sctx->fs_info->sectorsize;
+
+ return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+ pgoff_t index;
+ /*
+ * When calling this function, ssector must be alreaday attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ index = ssector->offset >> PAGE_SHIFT;
+ ASSERT(index < SCRUB_MAX_PAGES);
+ ASSERT(sblock->pages[index]);
+ ASSERT(PagePrivate(sblock->pages[index]));
+ return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+
+ /*
+ * When calling this function, ssector must be already attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+ return page_address(scrub_sector_get_page(ssector)) +
+ scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+ unsigned int len)
+{
+ return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+ scrub_sector_get_page_offset(ssector));
+}
+
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck);
+ struct scrub_block *sblocks_for_recheck[]);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
- int page_num, int force_write);
+ int sector_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num);
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
+ int sector_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
-static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_page_get(struct scrub_page *spage);
-static void scrub_page_put(struct scrub_page *spage);
+static void scrub_sector_get(struct scrub_sector *sector);
+static void scrub_sector_put(struct scrub_sector *sector);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum, int force,
- u64 physical_for_dev_replace);
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum,
+ u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u64 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
-static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
{
- return page->recover &&
- (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ return sector->recover &&
+ (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -546,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
- for (i = 0; i < sbio->page_count; i++) {
- WARN_ON(!sbio->pagev[i]->page);
- scrub_block_put(sbio->pagev[i]->sblock);
- }
+ for (i = 0; i < sbio->sector_count; i++)
+ scrub_block_put(sbio->sectors[i]->sblock);
bio_put(sbio->bio);
}
@@ -583,7 +746,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+ sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
@@ -597,9 +760,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
- sbio->page_count = 0;
- btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
- NULL);
+ sbio->sector_count = 0;
+ INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -610,18 +772,17 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
- sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
+ sctx->throttle_deadline = 0;
WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
sctx->wr_curr_bio = NULL;
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);
- sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}
@@ -636,7 +797,6 @@ nomem:
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
void *warn_ctx)
{
- u64 isize;
u32 nlink;
int ret;
int i;
@@ -647,13 +807,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
- struct btrfs_key root_key;
struct btrfs_key key;
- root_key.objectid = root;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ local_root = btrfs_get_fs_root(fs_info, root, true);
if (IS_ERR(local_root)) {
ret = PTR_ERR(local_root);
goto err;
@@ -668,6 +824,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
+ btrfs_put_root(local_root);
btrfs_release_path(swarn->path);
goto err;
}
@@ -675,7 +832,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
- isize = btrfs_inode_size(eb, inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
@@ -688,6 +844,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
ipath = init_ipath(4096, local_root, swarn->path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
+ btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL;
goto err;
@@ -703,14 +860,15 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
swarn->physical,
root, inum, offset,
- min(isize - offset, (u64)PAGE_SIZE), nlink,
+ fs_info->sectorsize, nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
+ btrfs_put_root(local_root);
free_ipath(ipath);
return 0;
@@ -743,16 +901,23 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u8 ref_level = 0;
int ret;
- WARN_ON(sblock->page_count < 1);
- dev = sblock->pagev[0]->dev;
+ WARN_ON(sblock->sector_count < 1);
+ dev = sblock->dev;
fs_info = sblock->sctx->fs_info;
+ /* Super block error, no need to search extent tree. */
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ errstr, rcu_str_deref(dev->name),
+ sblock->physical);
+ return;
+ }
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->pagev[0]->physical;
- swarn.logical = sblock->pagev[0]->logical;
+ swarn.physical = sblock->physical;
+ swarn.logical = sblock->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -766,7 +931,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ item_size = btrfs_item_size(eb, path->slots[0]);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
@@ -806,15 +971,15 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(recover->bbio);
+ btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
/*
* scrub_handle_errored_block gets called when either verification of the
- * pages failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all pages in the bio, even though only one
+ * sectors failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all sectors in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
@@ -822,43 +987,48 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
- struct btrfs_device *dev;
+ struct btrfs_device *dev = sblock_to_check->dev;
struct btrfs_fs_info *fs_info;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ /* One scrub_block for each mirror */
+ struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
- int page_num;
+ int sector_num;
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- BUG_ON(sblock_to_check->page_count < 1);
+ BUG_ON(sblock_to_check->sector_count < 1);
fs_info = sctx->fs_info;
- if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
- * if we find an error in a super block, we just report it.
+ * If we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
+ scrub_print_warning("super block error", sblock_to_check);
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
return 0;
}
- logical = sblock_to_check->pagev[0]->logical;
- BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
- failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
- is_metadata = !(sblock_to_check->pagev[0]->flags &
+ logical = sblock_to_check->logical;
+ ASSERT(sblock_to_check->mirror_num);
+ failed_mirror_index = sblock_to_check->mirror_num - 1;
+ is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->pagev[0]->have_csum;
- dev = sblock_to_check->pagev[0]->dev;
+ have_csum = sblock_to_check->sectors[0]->have_csum;
+
+ if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
+ return 0;
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
@@ -866,7 +1036,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* might be waiting the scrub task to pause (which needs to wait for all
* the worker tasks to complete before pausing).
* We do allocations in the workers through insert_full_stripe_lock()
- * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+ * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
* this function.
*/
nofs_flag = memalloc_nofs_save();
@@ -893,44 +1063,55 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
- * page by page this time in order to know which pages
+ * sector by sector this time in order to know which sectors
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
- * pages from those mirrors without I/O error on the
- * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
- * would be that mirror #1 has an I/O error on the first page,
- * the second page is good, and mirror #2 has an I/O error on
- * the second page, but the first page is good.
- * Then the first page of the first mirror can be repaired by
- * taking the first page of the second mirror, and the
- * second page of the second mirror can be repaired by
- * copying the contents of the 2nd page of the 1st mirror.
- * One more note: if the pages of one mirror contain I/O
+ * sectors from those mirrors without I/O error on the
+ * particular sectors. One example (with blocks >= 2 * sectorsize)
+ * would be that mirror #1 has an I/O error on the first sector,
+ * the second sector is good, and mirror #2 has an I/O error on
+ * the second sector, but the first sector is good.
+ * Then the first sector of the first mirror can be repaired by
+ * taking the first sector of the second mirror, and the
+ * second sector of the second mirror can be repaired by
+ * copying the contents of the 2nd sector of the 1st mirror.
+ * One more note: if the sectors of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
- * Only if this is not possible, the pages are picked from
+ * Only if this is not possible, the sectors are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
* the statistics.
*/
-
- sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_KERNEL);
- if (!sblocks_for_recheck) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ /*
+ * Note: the two members refs and outstanding_sectors are not
+ * used in the blocks that are used for the recheck procedure.
+ *
+ * But alloc_scrub_block() will initialize sblock::ref anyway,
+ * so we can use scrub_block_put() to clean them up.
+ *
+ * And here we don't setup the physical/dev for the sblock yet,
+ * they will be correctly initialized in scrub_setup_recheck_block().
+ */
+ sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+ logical, 0, 0, mirror_index);
+ if (!sblocks_for_recheck[mirror_index]) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
}
- /* setup the context, map the logical blocks and alloc the pages */
+ /* Setup the context, map the logical blocks and alloc the sectors */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
@@ -941,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
- sblock_bad = sblocks_for_recheck + failed_mirror_index;
+ sblock_bad = sblocks_for_recheck[failed_mirror_index];
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, 1);
@@ -949,7 +1130,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
- * the error disappeared after reading page by page, or
+ * The error disappeared after reading sector by sector, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
@@ -970,14 +1151,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -985,7 +1166,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
@@ -1010,10 +1191,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
- * checksum is present, only those pages are rewritten that had
+ * checksum is present, only those sectors are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
- * determined, which copy of the other pages is better (and it
- * could happen otherwise that a correct page would be
+ * determined, which copy of the other sectors is better (and it
+ * could happen otherwise that a correct sector would be
* overwritten by a bad one).
*/
for (mirror_index = 0; ;mirror_index++) {
@@ -1023,26 +1204,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
continue;
/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
- if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
- if (!sblocks_for_recheck[mirror_index].page_count)
+ if (!sblocks_for_recheck[mirror_index]->sector_count)
break;
- sblock_other = sblocks_for_recheck + mirror_index;
+ sblock_other = sblocks_for_recheck[mirror_index];
} else {
- struct scrub_recover *r = sblock_bad->pagev[0]->recover;
- int max_allowed = r->bbio->num_stripes -
- r->bbio->num_tgtdevs;
+ struct scrub_recover *r = sblock_bad->sectors[0]->recover;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
- if (!sblocks_for_recheck[1].page_count)
+ if (!sblocks_for_recheck[1]->sector_count)
break;
ASSERT(failed_mirror_index == 0);
- sblock_other = sblocks_for_recheck + 1;
- sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+ sblock_other = sblocks_for_recheck[1];
+ sblock_other->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
@@ -1068,39 +1248,39 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/*
* In case of I/O errors in the area that is supposed to be
- * repaired, continue by picking good copies of those pages.
- * Select the good pages from mirrors to rewrite bad pages from
+ * repaired, continue by picking good copies of those sectors.
+ * Select the good sectors from mirrors to rewrite bad sectors from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
- * all possible combinations of pages from the different mirrors
+ * all possible combinations of sectors from the different mirrors
* until the checksum verification succeeds. For example, when
- * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
* of mirror #2 is readable but the final checksum test fails,
- * then the 2nd page of mirror #3 could be tried, whether now
+ * then the 2nd sector of mirror #3 could be tried, whether now
* the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
- * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * disks) instead of on sectorsize. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
- * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * mirror, even if other 512 byte sectors in the same sectorsize
* area are unreadable.
*/
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ for (sector_num = 0; sector_num < sblock_bad->sector_count;
+ sector_num++) {
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
struct scrub_block *sblock_other = NULL;
- /* skip no-io-error page in scrub */
- if (!page_bad->io_error && !sctx->is_dev_replace)
+ /* Skip no-io-error sectors in scrub */
+ if (!sector_bad->io_error && !sctx->is_dev_replace)
continue;
- if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
/*
* In case of dev replace, if raid56 rebuild process
* didn't work out correct data, then copy the content
@@ -1109,16 +1289,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
- } else if (page_bad->io_error) {
- /* try to find no-io-error page in mirrors */
+ } else if (sector_bad->io_error) {
+ /* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
+ sblocks_for_recheck[mirror_index]->sector_count > 0;
mirror_index++) {
- if (!sblocks_for_recheck[mirror_index].
- pagev[page_num]->io_error) {
- sblock_other = sblocks_for_recheck +
- mirror_index;
+ if (!sblocks_for_recheck[mirror_index]->
+ sectors[sector_num]->io_error) {
+ sblock_other = sblocks_for_recheck[mirror_index];
break;
}
}
@@ -1128,27 +1307,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (sctx->is_dev_replace) {
/*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
+ * Did not find a mirror to fetch the sector from.
+ * scrub_write_sector_to_dev_replace() handles this
+ * case (sector->io_error), by filling the block with
+ * zeros before submitting the write request
*/
if (!sblock_other)
sblock_other = sblock_bad;
- if (scrub_write_page_to_dev_replace(sblock_other,
- page_num) != 0) {
+ if (scrub_write_sector_to_dev_replace(sblock_other,
+ sector_num) != 0) {
atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
} else if (sblock_other) {
- ret = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_other,
- page_num, 0);
+ ret = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_other,
+ sector_num, 0);
if (0 == ret)
- page_bad->io_error = 0;
+ sector_bad->io_error = 0;
else
success = 0;
}
@@ -1193,27 +1371,28 @@ did_not_correct_error:
}
out:
- if (sblocks_for_recheck) {
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
- mirror_index++) {
- struct scrub_block *sblock = sblocks_for_recheck +
- mirror_index;
- struct scrub_recover *recover;
- int page_index;
-
- for (page_index = 0; page_index < sblock->page_count;
- page_index++) {
- sblock->pagev[page_index]->sblock = NULL;
- recover = sblock->pagev[page_index]->recover;
- if (recover) {
- scrub_put_recover(fs_info, recover);
- sblock->pagev[page_index]->recover =
- NULL;
- }
- scrub_page_put(sblock->pagev[page_index]);
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+ struct scrub_recover *recover;
+ int sector_index;
+
+ /* Not allocated, continue checking the next mirror */
+ if (!sblock)
+ continue;
+
+ for (sector_index = 0; sector_index < sblock->sector_count;
+ sector_index++) {
+ /*
+ * Here we just cleanup the recover, each sector will be
+ * properly cleaned up by later scrub_block_put()
+ */
+ recover = sblock->sectors[sector_index]->recover;
+ if (recover) {
+ scrub_put_recover(fs_info, recover);
+ sblock->sectors[sector_index]->recover = NULL;
}
}
- kfree(sblocks_for_recheck);
+ scrub_block_put(sblock);
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
@@ -1223,19 +1402,18 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
- return (int)bbio->num_stripes;
+ return (int)bioc->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
u64 *raid_map,
- u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
@@ -1250,7 +1428,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
continue;
if (logical >= raid_map[i] &&
- logical < raid_map[i] + mapped_length)
+ logical < raid_map[i] + BTRFS_STRIPE_LEN)
break;
}
@@ -1264,125 +1442,112 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck)
+ struct scrub_block *sblocks_for_recheck[])
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * PAGE_SIZE;
- u64 logical = original_sblock->pagev[0]->logical;
- u64 generation = original_sblock->pagev[0]->generation;
- u64 flags = original_sblock->pagev[0]->flags;
- u64 have_csum = original_sblock->pagev[0]->have_csum;
+ u64 logical = original_sblock->logical;
+ u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
+ u64 generation = original_sblock->sectors[0]->generation;
+ u64 flags = original_sblock->sectors[0]->flags;
+ u64 have_csum = original_sblock->sectors[0]->have_csum;
struct scrub_recover *recover;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index = 0;
+ int sector_index = 0;
int mirror_index;
int nmirrors;
int ret;
- /*
- * note: the two members refs and outstanding_pages
- * are not used (and not set) in the blocks that are used for
- * the recheck procedure
- */
-
while (length > 0) {
- sublen = min_t(u64, length, PAGE_SIZE);
+ sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
- bbio = NULL;
+ bioc = NULL;
/*
- * with a length of PAGE_SIZE, each returned stripe
- * represents one mirror
+ * With a length of sectorsize, each returned stripe represents
+ * one mirror
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < sublen) {
- btrfs_put_bbio(bbio);
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
- recover->bbio = bbio;
+ recover->bioc = bioc;
recover->map_length = mapped_length;
- BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
- nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
- struct scrub_page *page;
+ struct scrub_sector *sector;
- sblock = sblocks_for_recheck + mirror_index;
+ sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
- page = kzalloc(sizeof(*page), GFP_NOFS);
- if (!page) {
-leave_nomem:
+ sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
+ if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_page_get(page);
- sblock->pagev[page_index] = page;
- page->sblock = sblock;
- page->flags = flags;
- page->generation = generation;
- page->logical = logical;
- page->have_csum = have_csum;
+ sector->flags = flags;
+ sector->generation = generation;
+ sector->have_csum = have_csum;
if (have_csum)
- memcpy(page->csum,
- original_sblock->pagev[0]->csum,
- sctx->csum_size);
+ memcpy(sector->csum,
+ original_sblock->sectors[0]->csum,
+ sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
- bbio->map_type,
- bbio->raid_map,
- mapped_length,
- bbio->num_stripes -
- bbio->num_tgtdevs,
+ bioc->map_type,
+ bioc->raid_map,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- page->physical = bbio->stripes[stripe_index].physical +
- stripe_offset;
- page->dev = bbio->stripes[stripe_index].dev;
-
- BUG_ON(page_index >= original_sblock->page_count);
- page->physical_for_dev_replace =
- original_sblock->pagev[page_index]->
- physical_for_dev_replace;
- /* for missing devices, dev->bdev is NULL */
- page->mirror_num = mirror_index + 1;
- sblock->page_count++;
- page->page = alloc_page(GFP_NOFS);
- if (!page->page)
- goto leave_nomem;
+ /*
+ * We're at the first sector, also populate @sblock
+ * physical and dev.
+ */
+ if (sector_index == 0) {
+ sblock->physical =
+ bioc->stripes[stripe_index].physical +
+ stripe_offset;
+ sblock->dev = bioc->stripes[stripe_index].dev;
+ sblock->physical_for_dev_replace =
+ original_sblock->physical_for_dev_replace;
+ }
+ BUG_ON(sector_index >= original_sblock->sector_count);
scrub_get_recover(recover);
- page->recover = recover;
+ sector->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
- page_index++;
+ sector_index++;
}
return 0;
@@ -1395,22 +1560,15 @@ static void scrub_bio_wait_endio(struct bio *bio)
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
- struct scrub_page *page)
+ struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
- int ret;
- int mirror_num;
- bio->bi_iter.bi_sector = page->logical >> 9;
+ bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+ SECTOR_SHIFT;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
-
- mirror_num = page->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
- page->recover->map_length,
- mirror_num, 0);
- if (ret)
- return ret;
+ raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
@@ -1419,26 +1577,24 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock)
{
- struct scrub_page *first_page = sblock->pagev[0];
+ struct scrub_sector *first_sector = sblock->sectors[0];
struct bio *bio;
- int page_num;
+ int i;
- /* All pages in sblock belong to the same stripe on the same device. */
- ASSERT(first_page->dev);
- if (!first_page->dev->bdev)
+ /* All sectors in sblock belong to the same stripe on the same device. */
+ ASSERT(sblock->dev);
+ if (!sblock->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
- bio_set_dev(bio, first_page->dev->bdev);
+ bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct scrub_page *page = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- WARN_ON(!page->page);
- bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
}
- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
bio_put(bio);
goto out;
}
@@ -1449,65 +1605,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
return;
out:
- for (page_num = 0; page_num < sblock->page_count; page_num++)
- sblock->pagev[page_num]->io_error = 1;
+ for (i = 0; i < sblock->sector_count; i++)
+ sblock->sectors[i]->io_error = 1;
sblock->no_io_error_seen = 0;
}
/*
- * this function will check the on disk data for checksum errors, header
- * errors and read I/O errors. If any I/O errors happen, the exact pages
- * which are errored are marked as being bad. The goal is to enable scrub
- * to take those pages that are not errored from all the mirrors so that
- * the pages that are errored in the just handled mirror can be repaired.
+ * This function will check the on disk data for checksum errors, header errors
+ * and read I/O errors. If any I/O errors happen, the exact sectors which are
+ * errored are marked as being bad. The goal is to enable scrub to take those
+ * sectors that are not errored from all the mirrors so that the sectors that
+ * are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror)
{
- int page_num;
+ int i;
sblock->no_io_error_seen = 1;
/* short cut for raid56 */
- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
return scrub_recheck_block_on_raid56(fs_info, sblock);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct bio *bio;
- struct scrub_page *page = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+ struct bio bio;
+ struct bio_vec bvec;
- if (page->dev->bdev == NULL) {
- page->io_error = 1;
+ if (sblock->dev->bdev == NULL) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!page->page);
- bio = btrfs_io_bio_alloc(1);
- bio_set_dev(bio, page->dev->bdev);
+ bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+ bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+ SECTOR_SHIFT;
- bio_add_page(bio, page->page, PAGE_SIZE, 0);
- bio->bi_iter.bi_sector = page->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
-
- if (btrfsic_submit_bio_wait(bio)) {
- page->io_error = 1;
+ btrfsic_check_bio(&bio);
+ if (submit_bio_wait(&bio)) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
}
- bio_put(bio);
+ bio_uninit(&bio);
}
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
}
-static inline int scrub_check_fsid(u8 fsid[],
- struct scrub_page *spage)
+static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
- struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1520,7 +1674,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
sblock->checksum_error = 0;
sblock->generation_error = 0;
- if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
scrub_checksum_data(sblock);
else
scrub_checksum_tree_block(sblock);
@@ -1529,15 +1683,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good)
{
- int page_num;
+ int i;
int ret = 0;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (i = 0; i < sblock_bad->sector_count; i++) {
int ret_sub;
- ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_good,
- page_num, 1);
+ ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_good, i, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -1545,46 +1698,42 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
return ret;
}
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int page_num, int force_write)
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int sector_num, int force_write)
{
- struct scrub_page *page_bad = sblock_bad->pagev[page_num];
- struct scrub_page *page_good = sblock_good->pagev[page_num];
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
+ struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
- BUG_ON(page_bad->page == NULL);
- BUG_ON(page_good->page == NULL);
if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || page_bad->io_error) {
- struct bio *bio;
+ sblock_bad->checksum_error || sector_bad->io_error) {
+ struct bio bio;
+ struct bio_vec bvec;
int ret;
- if (!page_bad->dev->bdev) {
+ if (!sblock_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
- bio = btrfs_io_bio_alloc(1);
- bio_set_dev(bio, page_bad->dev->bdev);
- bio->bi_iter.bi_sector = page_bad->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = (sblock_bad->physical +
+ sector_bad->offset) >> SECTOR_SHIFT;
+ ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
- ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
- if (PAGE_SIZE != ret) {
- bio_put(bio);
- return -EIO;
- }
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
- if (btrfsic_submit_bio_wait(bio)) {
- btrfs_dev_stat_inc_and_print(page_bad->dev,
+ if (ret) {
+ btrfs_dev_stat_inc_and_print(sblock_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
- bio_put(bio);
return -EIO;
}
- bio_put(bio);
}
return 0;
@@ -1593,7 +1742,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- int page_num;
+ int i;
/*
* This block is used for the check of the parity on the source device,
@@ -1602,36 +1751,60 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
if (sblock->sparity)
return;
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ for (i = 0; i < sblock->sector_count; i++) {
int ret;
- ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ ret = scrub_write_sector_to_dev_replace(sblock, i);
if (ret)
atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num)
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
+{
+ const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
+ struct scrub_sector *sector = sblock->sectors[sector_num];
+
+ if (sector->io_error)
+ memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
+
+ return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
+}
+
+static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
- struct scrub_page *spage = sblock->pagev[page_num];
+ int ret = 0;
+ u64 length;
- BUG_ON(spage->page == NULL);
- if (spage->io_error) {
- void *mapped_buffer = kmap_atomic(spage->page);
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return 0;
+
+ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
+ return 0;
- clear_page(mapped_buffer);
- flush_dcache_page(spage->page);
- kunmap_atomic(mapped_buffer);
+ if (sctx->write_pointer < physical) {
+ length = physical - sctx->write_pointer;
+
+ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
+ sctx->write_pointer, length);
+ if (!ret)
+ sctx->write_pointer = physical;
}
- return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+ return ret;
}
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static void scrub_block_get(struct scrub_block *sblock)
{
+ refcount_inc(&sblock->refs);
+}
+
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
+{
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
int ret;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
mutex_lock(&sctx->wr_lock);
again:
@@ -1643,38 +1816,39 @@ again:
return -ENOMEM;
}
sctx->wr_curr_bio->sctx = sctx;
- sctx->wr_curr_bio->page_count = 0;
+ sctx->wr_curr_bio->sector_count = 0;
}
sbio = sctx->wr_curr_bio;
- if (sbio->page_count == 0) {
- struct bio *bio;
+ if (sbio->sector_count == 0) {
+ ret = fill_writer_pointer_gap(sctx, sector->offset +
+ sblock->physical_for_dev_replace);
+ if (ret) {
+ mutex_unlock(&sctx->wr_lock);
+ return ret;
+ }
- sbio->physical = spage->physical_for_dev_replace;
- sbio->logical = spage->logical;
+ sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
sbio->dev = sctx->wr_tgtdev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
- sbio->bio = bio;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_WRITE, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_wr_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_wr_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
- spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
- spage->logical) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sblock->physical_for_dev_replace + sector->offset ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sblock->logical + sector->offset) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
- if (sbio->page_count < 1) {
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
+ if (ret != sectorsize) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
mutex_unlock(&sctx->wr_lock);
@@ -1684,10 +1858,17 @@ again:
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- scrub_page_get(spage);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_wr_bio)
+ sbio->sectors[sbio->sector_count] = sector;
+ scrub_sector_get(sector);
+ /*
+ * Since ssector no longer holds a page, but uses sblock::pages, we
+ * have to ensure the sblock had not been freed before our write bio
+ * finished.
+ */
+ scrub_block_get(sector->sblock);
+
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
@@ -1703,13 +1884,17 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_disk);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
+
+ if (btrfs_is_zoned(sctx->fs_info))
+ sctx->write_pointer = sbio->physical + sbio->sector_count *
+ sctx->fs_info->sectorsize;
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -1720,31 +1905,37 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
- btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+ INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
+ queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_wr_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
+ sector->io_error = 1;
atomic64_inc(&dev_replace->num_write_errors);
}
}
- for (i = 0; i < sbio->page_count; i++)
- scrub_page_put(sbio->pagev[i]);
+ /*
+ * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+ * endio we should put the sblock.
+ */
+ for (i = 0; i < sbio->sector_count; i++) {
+ scrub_block_put(sbio->sectors[i]->sblock);
+ scrub_sector_put(sbio->sectors[i]);
+ }
bio_put(sbio->bio);
kfree(sbio);
@@ -1768,15 +1959,15 @@ static int scrub_checksum(struct scrub_block *sblock)
sblock->generation_error = 0;
sblock->checksum_error = 0;
- WARN_ON(sblock->page_count < 1);
- flags = sblock->pagev[0]->flags;
+ WARN_ON(sblock->sector_count < 1);
+ flags = sblock->sectors[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = scrub_checksum_tree_block(sblock);
else if (flags & BTRFS_EXTENT_FLAG_SUPER)
- (void)scrub_checksum_super(sblock);
+ ret = scrub_checksum_super(sblock);
else
WARN_ON(1);
if (ret)
@@ -1791,44 +1982,23 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- u8 *on_disk_csum;
- struct page *page;
- void *buffer;
- u64 len;
- int index;
+ struct scrub_sector *sector;
+ char *kaddr;
- BUG_ON(sblock->page_count < 1);
- if (!sblock->pagev[0]->have_csum)
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ if (!sector->have_csum)
return 0;
+ kaddr = scrub_sector_get_kaddr(sector);
+
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- on_disk_csum = sblock->pagev[0]->csum;
- page = sblock->pagev[0]->page;
- buffer = kmap_atomic(page);
-
- len = sctx->fs_info->sectorsize;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, PAGE_SIZE);
-
- crypto_shash_update(shash, buffer, l);
- kunmap_atomic(buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- buffer = kmap_atomic(page);
- }
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- crypto_shash_final(shash, csum);
- if (memcmp(csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(csum, sector->csum, fs_info->csum_size))
sblock->checksum_error = 1;
-
return sblock->checksum_error;
}
@@ -1840,65 +2010,59 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
- u64 len;
- int index;
+ /*
+ * This is done in sectorsize steps even for metadata as there's a
+ * constraint for nodesize to be aligned to sectorsize. This will need
+ * to change so we don't misuse data and metadata units like that.
+ */
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int i;
+ struct scrub_sector *sector;
+ char *kaddr;
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
+ BUG_ON(sblock->sector_count < 1);
+
+ /* Each member in sectors is just one sector */
+ ASSERT(sblock->sector_count == num_sectors);
- BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- h = (struct btrfs_header *)mapped_buffer;
- memcpy(on_disk_csum, h->csum, sctx->csum_size);
+ sector = sblock->sectors[0];
+ kaddr = scrub_sector_get_kaddr(sector);
+ h = (struct btrfs_header *)kaddr;
+ memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
/*
* we don't use the getter functions here, as we
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+ if (sblock->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ if (sector->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(h->fsid, sector))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
sblock->header_error = 1;
- len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ sectorsize - BTRFS_CSUM_SIZE);
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
+ for (i = 1; i < num_sectors; i++) {
+ kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
+ crypto_shash_update(shash, kaddr, sectorsize);
}
crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->header_error || sblock->checksum_error;
@@ -1911,84 +2075,36 @@ static int scrub_checksum_super(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- u8 on_disk_csum[BTRFS_CSUM_SIZE];
- struct page *page;
- void *mapped_buffer;
- u64 mapped_size;
- void *p;
+ struct scrub_sector *sector;
+ char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- u64 len;
- int index;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
- BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0]->page;
- mapped_buffer = kmap_atomic(page);
- s = (struct btrfs_super_block *)mapped_buffer;
- memcpy(on_disk_csum, s->csum, sctx->csum_size);
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ kaddr = scrub_sector_get_kaddr(sector);
+ s = (struct btrfs_super_block *)kaddr;
- if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+ if (sblock->logical != btrfs_super_bytenr(s))
++fail_cor;
- if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+ if (sector->generation != btrfs_super_generation(s))
++fail_gen;
- if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+ if (!scrub_check_fsid(s->fsid, sector))
++fail_cor;
- len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
- index = 0;
- for (;;) {
- u64 l = min_t(u64, len, mapped_size);
-
- crypto_shash_update(shash, p, l);
- kunmap_atomic(mapped_buffer);
- len -= l;
- if (len == 0)
- break;
- index++;
- BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index]->page);
- page = sblock->pagev[index]->page;
- mapped_buffer = kmap_atomic(page);
- mapped_size = PAGE_SIZE;
- p = mapped_buffer;
- }
+ shash->tfm = fs_info->csum_shash;
+ crypto_shash_init(shash);
+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
- crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
- if (fail_cor + fail_gen) {
- /*
- * if we find an error in a super block, we just report it.
- * They will get written with the next transaction commit
- * anyway
- */
- spin_lock(&sctx->stat_lock);
- ++sctx->stat.super_errors;
- spin_unlock(&sctx->stat_lock);
- if (fail_cor)
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- else
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
- BTRFS_DEV_STAT_GENERATION_ERRS);
- }
-
return fail_cor + fail_gen;
}
-static void scrub_block_get(struct scrub_block *sblock)
-{
- refcount_inc(&sblock->refs);
-}
-
static void scrub_block_put(struct scrub_block *sblock)
{
if (refcount_dec_and_test(&sblock->refs)) {
@@ -1997,24 +2113,86 @@ static void scrub_block_put(struct scrub_block *sblock)
if (sblock->sparity)
scrub_parity_put(sblock->sparity);
- for (i = 0; i < sblock->page_count; i++)
- scrub_page_put(sblock->pagev[i]);
+ for (i = 0; i < sblock->sector_count; i++)
+ scrub_sector_put(sblock->sectors[i]);
+ for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+ if (sblock->pages[i]) {
+ detach_scrub_page_private(sblock->pages[i]);
+ __free_page(sblock->pages[i]);
+ }
+ }
kfree(sblock);
}
}
-static void scrub_page_get(struct scrub_page *spage)
+static void scrub_sector_get(struct scrub_sector *sector)
+{
+ atomic_inc(&sector->refs);
+}
+
+static void scrub_sector_put(struct scrub_sector *sector)
{
- atomic_inc(&spage->refs);
+ if (atomic_dec_and_test(&sector->refs))
+ kfree(sector);
}
-static void scrub_page_put(struct scrub_page *spage)
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
{
- if (atomic_dec_and_test(&spage->refs)) {
- if (spage->page)
- __free_page(spage->page);
- kfree(spage);
+ const int time_slice = 1000;
+ struct scrub_bio *sbio;
+ struct btrfs_device *device;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;
+
+ sbio = sctx->bios[sctx->curr];
+ device = sbio->dev;
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
+
+ /*
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
+ */
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
+
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
+ }
+
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
+
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
+ } else {
+ /* New request after deadline, start new epoch */
+ delta = 0;
+ }
+
+ if (delta) {
+ long timeout;
+
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
}
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
}
static void scrub_submit(struct scrub_ctx *sctx)
@@ -2024,17 +2202,21 @@ static void scrub_submit(struct scrub_ctx *sctx)
if (sctx->curr == -1)
return;
+ scrub_throttle(sctx);
+
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
}
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
{
- struct scrub_block *sblock = spage->sblock;
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
again:
@@ -2047,7 +2229,7 @@ again:
if (sctx->curr != -1) {
sctx->first_free = sctx->bios[sctx->curr]->next_free;
sctx->bios[sctx->curr]->next_free = -1;
- sctx->bios[sctx->curr]->page_count = 0;
+ sctx->bios[sctx->curr]->sector_count = 0;
spin_unlock(&sctx->list_lock);
} else {
spin_unlock(&sctx->list_lock);
@@ -2055,37 +2237,31 @@ again:
}
}
sbio = sctx->bios[sctx->curr];
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- sbio->physical = spage->physical;
- sbio->logical = spage->logical;
- sbio->dev = spage->dev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
- sbio->bio = bio;
+ if (sbio->sector_count == 0) {
+ sbio->physical = sblock->physical + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
+ sbio->dev = sblock->dev;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_READ, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
- spage->physical ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
- spage->logical ||
- sbio->dev != spage->dev) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sblock->physical + sector->offset ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sblock->logical + sector->offset ||
+ sbio->dev != sblock->dev) {
scrub_submit(sctx);
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
- if (sbio->page_count < 1) {
+ sbio->sectors[sbio->sector_count] = sector;
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
+ if (ret != sectorsize) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
return -EIO;
@@ -2095,9 +2271,9 @@ again:
}
scrub_block_get(sblock); /* one for the page added to the bio */
- atomic_inc(&sblock->outstanding_pages);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_rd_bio)
+ atomic_inc(&sblock->outstanding_sectors);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_submit(sctx);
return 0;
@@ -2108,15 +2284,16 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
+ btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
sblock->no_io_error_seen = 0;
bio_put(bio);
- btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+ queue_work(fs_info->scrub_workers, &sblock->work);
}
-static void scrub_missing_raid56_worker(struct btrfs_work *work)
+static void scrub_missing_raid56_worker(struct work_struct *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
@@ -2124,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
u64 logical;
struct btrfs_device *dev;
- logical = sblock->pagev[0]->logical;
- dev = sblock->pagev[0]->dev;
+ logical = sblock->logical;
+ dev = sblock->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
@@ -2162,9 +2339,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = sblock->page_count * PAGE_SIZE;
- u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio = NULL;
+ u64 length = sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = sblock->logical;
+ struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
@@ -2172,61 +2349,66 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
- !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
- * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ * there's a bug in scrub_find_good_copy()/btrfs_map_block().
*/
- goto bbio_out;
+ goto bioc_out;
}
- bio = btrfs_io_bio_alloc(0);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc);
if (!rbio)
goto rbio_out;
- for (i = 0; i < sblock->page_count; i++) {
- struct scrub_page *spage = sblock->pagev[i];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+ scrub_sector_get_page_offset(sector),
+ sector->offset + sector->sblock->logical);
}
- btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
+ INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
+ btrfs_put_bioc(bioc);
return;
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
}
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum, int force,
+ u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
{
struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical,
+ physical_for_dev_replace, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2234,53 +2416,38 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
-
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
- u64 l = min_t(u64, len, PAGE_SIZE);
+ struct scrub_sector *sector;
+ /*
+ * Here we will allocate one page for one sector to scrub.
+ * This is fine if PAGE_SIZE == sectorsize, but will cost
+ * more memory for PAGE_SIZE > sectorsize case.
+ */
+ u32 l = min(sectorsize, len);
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
-leave_nomem:
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->physical_for_dev_replace = physical_for_dev_replace;
- spage->mirror_num = mirror_num;
+ sector->flags = flags;
+ sector->generation = gen;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
- goto leave_nomem;
len -= l;
logical += l;
physical += l;
physical_for_dev_replace += l;
}
- WARN_ON(sblock->page_count == 0);
+ WARN_ON(sblock->sector_count == 0);
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
@@ -2288,18 +2455,18 @@ leave_nomem:
*/
scrub_missing_raid56_pages(sblock);
} else {
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
- if (force)
+ if (flags & BTRFS_EXTENT_FLAG_SUPER)
scrub_submit(sctx);
}
@@ -2316,31 +2483,31 @@ static void scrub_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+ queue_work(fs_info->scrub_workers, &sbio->work);
}
-static void scrub_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
- spage->sblock->no_io_error_seen = 0;
+ sector->io_error = 1;
+ sector->sblock->no_io_error_seen = 0;
}
}
- /* now complete the scrub_block items that have all pages completed */
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
- struct scrub_block *sblock = spage->sblock;
+ /* Now complete the scrub_block items that have all pages completed */
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+ struct scrub_block *sblock = sector->sblock;
- if (atomic_dec_and_test(&sblock->outstanding_pages))
+ if (atomic_dec_and_test(&sblock->outstanding_sectors))
scrub_block_complete(sblock);
scrub_block_put(sblock);
}
@@ -2363,12 +2530,11 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
- u64 start, u64 len)
+ u64 start, u32 len)
{
u64 offset;
- u64 nsectors64;
u32 nsectors;
- int sectorsize = sparity->sctx->fs_info->sectorsize;
+ u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
if (len >= sparity->stripe_len) {
bitmap_set(bitmap, 0, sparity->nsectors);
@@ -2377,11 +2543,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
start -= sparity->logic_start;
start = div64_u64_rem(start, sparity->stripe_len, &offset);
- offset = div_u64(offset, sectorsize);
- nsectors64 = div_u64(len, sectorsize);
-
- ASSERT(nsectors64 < UINT_MAX);
- nsectors = (u32)nsectors64;
+ offset = offset >> sectorsize_bits;
+ nsectors = len >> sectorsize_bits;
if (offset + nsectors <= sparity->nsectors) {
bitmap_set(bitmap, offset, nsectors);
@@ -2393,15 +2556,15 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
- u64 start, u64 len)
+ u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
- u64 start, u64 len)
+ u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
}
static void scrub_block_complete(struct scrub_block *sblock)
@@ -2423,55 +2586,88 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->pagev[0]->logical;
- u64 end = sblock->pagev[sblock->page_count - 1]->logical +
- PAGE_SIZE;
+ u64 start = sblock->logical;
+ u64 end = sblock->logical +
+ sblock->sectors[sblock->sector_count - 1]->offset +
+ sblock->sctx->fs_info->sectorsize;
+ ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
start, end - start);
}
}
+static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
+{
+ sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
+ list_del(&sum->list);
+ kfree(sum);
+}
+
+/*
+ * Find the desired csum for range [logical, logical + sectorsize), and store
+ * the csum into @csum.
+ *
+ * The search source is sctx->csum_list, which is a pre-populated list
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
+ * that is before @logical.
+ *
+ * Return 0 if there is no csum for the range.
+ * Return 1 if there is csum for the range and copied to @csum.
+ */
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
- struct btrfs_ordered_sum *sum = NULL;
- unsigned long index;
- unsigned long num_sectors;
+ bool found = false;
while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum = NULL;
+ unsigned long index;
+ unsigned long num_sectors;
+
sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
+ /* The current csum range is beyond our range, no csum found */
if (sum->bytenr > logical)
- return 0;
- if (sum->bytenr + sum->len > logical)
break;
- ++sctx->stat.csum_discards;
- list_del(&sum->list);
- kfree(sum);
- sum = NULL;
- }
- if (!sum)
- return 0;
+ /*
+ * The current sum is before our bytenr, since scrub is always
+ * done in bytenr order, the csum will never be used anymore,
+ * clean it up so that later calls won't bother with the range,
+ * and continue search the next range.
+ */
+ if (sum->bytenr + sum->len <= logical) {
+ drop_csum_range(sctx, sum);
+ continue;
+ }
- index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
- ASSERT(index < UINT_MAX);
+ /* Now the csum range covers our bytenr, copy the csum */
+ found = true;
+ index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
+ num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
- num_sectors = sum->len / sctx->fs_info->sectorsize;
- memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
- if (index == num_sectors - 1) {
- list_del(&sum->list);
- kfree(sum);
+ memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
+ sctx->fs_info->csum_size);
+
+ /* Cleanup the range if we're at the end of the csum range */
+ if (index == num_sectors - 1)
+ drop_csum_range(sctx, sum);
+ break;
}
+ if (!found)
+ return 0;
return 1;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u64 physical_for_dev_replace)
+ u64 gen, int mirror_num)
{
+ struct btrfs_device *src_dev = dev;
+ u64 src_physical = physical;
+ int src_mirror = mirror_num;
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
@@ -2499,8 +2695,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
WARN_ON(1);
}
+ /*
+ * For dev-replace case, we can have @dev being a missing device.
+ * Regular scrub will avoid its execution on missing device at all,
+ * as that would trigger tons of read error.
+ *
+ * Reading from missing device will cause read error counts to
+ * increase unnecessarily.
+ * So here we change the read source to a good mirror.
+ */
+ if (sctx->is_dev_replace && !dev->bdev)
+ scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
+ &src_dev, &src_mirror);
while (len) {
- u64 l = min_t(u64, len, blocksize);
+ u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -2509,29 +2717,32 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
if (have_csum == 0)
++sctx->stat.no_csum;
}
- ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
- mirror_num, have_csum ? csum : NULL, 0,
- physical_for_dev_replace);
+ ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
+ flags, gen, src_mirror,
+ have_csum ? csum : NULL, physical);
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
- physical_for_dev_replace += l;
+ src_physical += l;
}
return 0;
}
-static int scrub_pages_for_parity(struct scrub_parity *sparity,
- u64 logical, u64 len,
+static int scrub_sectors_for_parity(struct scrub_parity *sparity,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ ASSERT(IS_ALIGNED(len, sectorsize));
+
+ sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2539,75 +2750,58 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
- u64 l = min_t(u64, len, PAGE_SIZE);
+ struct scrub_sector *sector;
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
-leave_nomem:
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
- /* For scrub block */
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
+ sblock->sectors[index] = sector;
/* For scrub parity */
- scrub_page_get(spage);
- list_add_tail(&spage->list, &sparity->spages);
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->mirror_num = mirror_num;
+ scrub_sector_get(sector);
+ list_add_tail(&sector->list, &sparity->sectors_list);
+ sector->flags = flags;
+ sector->generation = gen;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
- goto leave_nomem;
- len -= l;
- logical += l;
- physical += l;
+
+ /* Iterate over the stripe range in sectorsize steps */
+ len -= sectorsize;
+ logical += sectorsize;
+ physical += sectorsize;
}
- WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ WARN_ON(sblock->sector_count == 0);
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
- /* last one frees, either here or in bio completion for last page */
+ /* Last one frees, either here or in bio completion for last sector */
scrub_block_put(sblock);
return 0;
}
static int scrub_extent_for_parity(struct scrub_parity *sparity,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num)
{
@@ -2631,7 +2825,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
while (len) {
- u64 l = min_t(u64, len, blocksize);
+ u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -2640,7 +2834,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (have_csum == 0)
goto skip;
}
- ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
flags, gen, mirror_num,
have_csum ? csum : NULL);
if (ret)
@@ -2700,10 +2894,10 @@ static int get_raid56_logic_offset(u64 physical, int num,
static void scrub_free_parity(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_page *curr, *next;
+ struct scrub_sector *curr, *next;
int nbits;
- nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
if (nbits) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors += nbits;
@@ -2711,38 +2905,38 @@ static void scrub_free_parity(struct scrub_parity *sparity)
spin_unlock(&sctx->stat_lock);
}
- list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
list_del_init(&curr->list);
- scrub_page_put(curr);
+ scrub_sector_put(curr);
}
kfree(sparity);
}
-static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+static void scrub_parity_bio_endio_worker(struct work_struct *work)
{
struct scrub_parity *sparity = container_of(work, struct scrub_parity,
work);
struct scrub_ctx *sctx = sparity->sctx;
+ btrfs_bio_counter_dec(sctx->fs_info);
scrub_free_parity(sparity);
scrub_pending_bio_dec(sctx);
}
static void scrub_parity_bio_endio(struct bio *bio)
{
- struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_parity *sparity = bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
- sparity->nsectors);
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
+ &sparity->dbitmap, sparity->nsectors);
bio_put(bio);
- btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
- NULL);
- btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
+ INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
+ queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -2751,31 +2945,32 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 length;
int ret;
- if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
- sparity->nsectors))
+ if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
+ &sparity->ebitmap, sparity->nsectors))
goto out;
length = sparity->logic_end - sparity->logic_start;
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
- bio = btrfs_io_bio_alloc(0);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
- length, sparity->scrub_dev,
- sparity->dbitmap,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
+ sparity->scrub_dev,
+ &sparity->dbitmap,
sparity->nsectors);
+ btrfs_put_bioc(bioc);
if (!rbio)
goto rbio_out;
@@ -2785,10 +2980,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2797,11 +2991,6 @@ out:
scrub_free_parity(sparity);
}
-static inline int scrub_calc_parity_bitmap_len(int nsectors)
-{
- return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
-}
-
static void scrub_parity_get(struct scrub_parity *sparity)
{
refcount_inc(&sparity->refs);
@@ -2815,46 +3004,287 @@ static void scrub_parity_put(struct scrub_parity *sparity)
scrub_parity_check_and_repair(sparity);
}
+/*
+ * Return 0 if the extent item range covers any byte of the range.
+ * Return <0 if the extent item is before @search_start.
+ * Return >0 if the extent item is after @start_start + @search_len.
+ */
+static int compare_extent_item_range(struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
+ u64 len;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ len = fs_info->nodesize;
+ else
+ len = key.offset;
+
+ if (key.objectid + len <= search_start)
+ return -1;
+ if (key.objectid >= search_start + search_len)
+ return 1;
+ return 0;
+}
+
+/*
+ * Locate one extent item which covers any byte in range
+ * [@search_start, @search_start + @search_length)
+ *
+ * If the path is not initialized, we will initialize the search by doing
+ * a btrfs_search_slot().
+ * If the path is already initialized, we will use the path as the initial
+ * slot, to avoid duplicated btrfs_search_slot() calls.
+ *
+ * NOTE: If an extent item starts before @search_start, we will still
+ * return the extent item. This is for data extent crossing stripe boundary.
+ *
+ * Return 0 if we found such extent item, and @path will point to the extent item.
+ * Return >0 if no such extent item can be found, and @path will be released.
+ * Return <0 if hit fatal error, and @path will be released.
+ */
+static int find_first_extent_item(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct btrfs_key key;
+ int ret;
+
+ /* Continue using the existing path */
+ if (path->nodes[0])
+ goto search_forward;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = search_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(ret > 0);
+ /*
+ * Here we intentionally pass 0 as @min_objectid, as there could be
+ * an extent item starting before @search_start.
+ */
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
+ if (ret < 0)
+ return ret;
+ /*
+ * No matter whether we have found an extent item, the next loop will
+ * properly do every check on the key.
+ */
+search_forward:
+ while (true) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid >= search_start + search_len)
+ break;
+ if (key.type != BTRFS_METADATA_ITEM_KEY &&
+ key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto next;
+
+ ret = compare_extent_item_range(path, search_start, search_len);
+ if (ret == 0)
+ return ret;
+ if (ret > 0)
+ break;
+next:
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret) {
+ /* Either no more item or fatal error */
+ btrfs_release_path(path);
+ return ret;
+ }
+ }
+ }
+ btrfs_release_path(path);
+ return 1;
+}
+
+static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
+ u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_item *ei;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_EXTENT_ITEM_KEY);
+ *extent_start_ret = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ *size_ret = path->nodes[0]->fs_info->nodesize;
+ else
+ *size_ret = key.offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
+ *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
+ *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
+}
+
+static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
+ u64 boundary_start, u64 boudary_len)
+{
+ return (extent_start < boundary_start &&
+ extent_start + extent_len > boundary_start) ||
+ (extent_start < boundary_start + boudary_len &&
+ extent_start + extent_len > boundary_start + boudary_len);
+}
+
+static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
+ struct scrub_parity *sparity,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logical)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
+ u64 cur_logical = logical;
+ int ret;
+
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+
+ /* Path must not be populated */
+ ASSERT(!path->nodes[0]);
+
+ while (cur_logical < logical + map->stripe_len) {
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_device *extent_dev;
+ u64 extent_start;
+ u64 extent_size;
+ u64 mapped_length;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 extent_physical;
+ u64 extent_mirror_num;
+
+ ret = find_first_extent_item(extent_root, path, cur_logical,
+ logical + map->stripe_len - cur_logical);
+ /* No more extent item in this data stripe */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(path, &extent_start, &extent_size, &extent_flags,
+ &extent_gen);
+
+ /* Metadata should not cross stripe boundaries */
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_size,
+ logical, map->stripe_len)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ extent_start, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += extent_size;
+ continue;
+ }
+
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /* Truncate the range inside this data stripe */
+ extent_size = min(extent_start + extent_size,
+ logical + map->stripe_len) - cur_logical;
+ extent_start = cur_logical;
+ ASSERT(extent_size <= U32_MAX);
+
+ scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+
+ mapped_length = extent_size;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
+ &mapped_length, &bioc, 0);
+ if (!ret && (!bioc || mapped_length < extent_size))
+ ret = -EIO;
+ if (ret) {
+ btrfs_put_bioc(bioc);
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+
+ ret = btrfs_lookup_csums_range(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1, false);
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ ret = scrub_extent_for_parity(sparity, extent_start,
+ extent_size, extent_physical,
+ extent_dev, extent_flags,
+ extent_gen, extent_mirror_num);
+ scrub_free_csums(sctx);
+
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ cond_resched();
+ cur_logical += extent_size;
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
- struct btrfs_path *path,
u64 logic_start,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_root *csum_root = fs_info->csum_root;
- struct btrfs_extent_item *extent;
- struct btrfs_bio *bbio = NULL;
- u64 flags;
+ struct btrfs_path *path;
+ u64 cur_logical;
int ret;
- int slot;
- struct extent_buffer *l;
- struct btrfs_key key;
- u64 generation;
- u64 extent_logical;
- u64 extent_physical;
- u64 extent_len;
- u64 mapped_length;
- struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
- int bitmap_len;
- int extent_mirror_num;
- int stop_loop = 0;
- nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
- bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
- sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
- GFP_NOFS);
+ path = btrfs_alloc_path();
+ if (!path) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ASSERT(map->stripe_len <= U32_MAX);
+ nsectors = map->stripe_len >> fs_info->sectorsize_bits;
+ ASSERT(nsectors <= BITS_PER_LONG);
+ sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
if (!sparity) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
+ btrfs_free_path(path);
return -ENOMEM;
}
+ ASSERT(map->stripe_len <= U32_MAX);
sparity->stripe_len = map->stripe_len;
sparity->nsectors = nsectors;
sparity->sctx = sctx;
@@ -2862,258 +3292,299 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
- INIT_LIST_HEAD(&sparity->spages);
- sparity->dbitmap = sparity->bitmap;
- sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+ INIT_LIST_HEAD(&sparity->sectors_list);
ret = 0;
- while (logic_start < logic_end) {
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logic_start;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ for (cur_logical = logic_start; cur_logical < logic_end;
+ cur_logical += map->stripe_len) {
+ ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
+ sdev, path, cur_logical);
if (ret < 0)
- goto out;
+ break;
+ }
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
+ scrub_parity_put(sparity);
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
- stop_loop = 0;
- while (1) {
- u64 bytes;
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
+static void sync_replace_for_zoned(struct scrub_ctx *sctx)
+{
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return;
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+}
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+ u64 physical, u64 physical_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret = 0;
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
- if (key.objectid + bytes <= logic_start)
- goto next;
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
- if (key.objectid >= logic_end) {
- stop_loop = 1;
- break;
- }
+ mutex_lock(&sctx->wr_lock);
+ if (sctx->write_pointer < physical_end) {
+ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+ physical,
+ sctx->write_pointer);
+ if (ret)
+ btrfs_err(fs_info,
+ "zoned: failed to recover write pointer");
+ }
+ mutex_unlock(&sctx->wr_lock);
+ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
- while (key.objectid >= logic_start + map->stripe_len)
- logic_start += map->stripe_len;
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logic_start ||
- key.objectid + bytes >
- logic_start + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-again:
- extent_logical = key.objectid;
- extent_len = bytes;
+ return ret;
+}
- if (extent_logical < logic_start) {
- extent_len -= logic_start - extent_logical;
- extent_logical = logic_start;
- }
+/*
+ * Scrub one range which can only has simple mirror based profile.
+ * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
+ * RAID0/RAID10).
+ *
+ * Since we may need to handle a subset of block group, we need @logical_start
+ * and @logical_length parameter.
+ */
+static int scrub_simple_mirror(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 logical_start, u64 logical_length,
+ struct btrfs_device *device,
+ u64 physical, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ const u64 logical_end = logical_start + logical_length;
+ /* An artificial limit, inherit from old scrub behavior */
+ const u32 max_length = SZ_64K;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ int ret;
- if (extent_logical + extent_len >
- logic_start + map->stripe_len)
- extent_len = logic_start + map->stripe_len -
- extent_logical;
-
- scrub_parity_mark_sectors_data(sparity, extent_logical,
- extent_len);
-
- mapped_length = extent_len;
- bbio = NULL;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bbio,
- 0);
- if (!ret) {
- if (!bbio || mapped_length < extent_len)
- ret = -EIO;
- }
- if (ret) {
- btrfs_put_bbio(bbio);
- goto out;
- }
- extent_physical = bbio->stripes[0].physical;
- extent_mirror_num = bbio->mirror_num;
- extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
-
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
+ /* The range must be inside the bg */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+ /* Go through each extent items inside the logical range */
+ while (cur_logical < logical_end) {
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 scrub_len;
- ret = scrub_extent_for_parity(sparity, extent_logical,
- extent_len,
- extent_physical,
- extent_dev, flags,
- generation,
- extent_mirror_num);
+ /* Canceled? */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ break;
+ }
+ /* Paused? */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* Push queued extents */
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ sctx->flush_all_writes = false;
+ scrub_blocked_if_needed(fs_info);
+ }
+ /* Block group removed? */
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&bg->lock);
- scrub_free_csums(sctx);
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ logical_end - cur_logical);
+ if (ret > 0) {
+ /* No more extent, just update the accounting */
+ sctx->stat.last_physical = physical + logical_length;
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+ /*
+ * Scrub len has three limits:
+ * - Extent size limit
+ * - Scrub range limit
+ * This is especially imporatant for RAID0/RAID10 to reuse
+ * this function
+ * - Max scrub size limit
+ */
+ scrub_len = min(min(extent_start + extent_len,
+ logical_end), cur_logical + max_length) -
+ cur_logical;
+
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ cur_logical + scrub_len - 1,
+ &sctx->csum_list, 1, false);
if (ret)
- goto out;
+ break;
+ }
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_len,
+ logical_start, logical_length)) {
+ btrfs_err(fs_info,
+"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
+ extent_start, logical_start, logical_end);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += scrub_len;
+ continue;
+ }
+ ret = scrub_extent(sctx, map, cur_logical, scrub_len,
+ cur_logical - logical_start + physical,
+ device, extent_flags, extent_gen,
+ mirror_num);
+ scrub_free_csums(sctx);
+ if (ret)
+ break;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+ cur_logical += scrub_len;
+ /* Don't hold CPU for too long time */
+ cond_resched();
+ }
+ btrfs_release_path(&path);
+ return ret;
+}
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- logic_start += map->stripe_len;
+/* Calculate the full stripe length for simple stripe based profiles */
+static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
- if (logic_start >= logic_end) {
- stop_loop = 1;
- break;
- }
+ return map->num_stripes / map->sub_stripes * map->stripe_len;
+}
- if (logic_start < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
- }
-next:
- path->slots[0]++;
- }
+/* Get the logical bytenr for the stripe */
+static u64 simple_stripe_get_logical(struct map_lookup *map,
+ struct btrfs_block_group *bg,
+ int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
- btrfs_release_path(path);
+ /*
+ * (stripe_index / sub_stripes) gives how many data stripes we need to
+ * skip.
+ */
+ return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+}
- if (stop_loop)
- break;
+/* Get the mirror number for the stripe */
+static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
- logic_start += map->stripe_len;
- }
-out:
- if (ret < 0)
- scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start);
- scrub_parity_put(sparity);
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
+ /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
+ return stripe_index % map->sub_stripes + 1;
+}
- btrfs_release_path(path);
- return ret < 0 ? ret : 0;
+static int scrub_simple_stripe(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct btrfs_device *device,
+ int stripe_index)
+{
+ const u64 logical_increment = simple_stripe_full_stripe_len(map);
+ const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
+ const u64 orig_physical = map->stripes[stripe_index].physical;
+ const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
+ u64 cur_logical = orig_logical;
+ u64 cur_physical = orig_physical;
+ int ret = 0;
+
+ while (cur_logical < bg->start + bg->length) {
+ /*
+ * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
+ * just RAID1, so we can reuse scrub_simple_mirror() to scrub
+ * this stripe.
+ */
+ ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
+ cur_logical, map->stripe_len, device,
+ cur_physical, mirror_num);
+ if (ret)
+ return ret;
+ /* Skip to next stripe which belongs to the target device */
+ cur_logical += logical_increment;
+ /* For physical offset, we just go to next stripe */
+ cur_physical += map->stripe_len;
+ }
+ return ret;
}
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
- struct map_lookup *map,
+ struct btrfs_block_group *bg,
+ struct extent_map *em,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length)
+ int stripe_index)
{
- struct btrfs_path *path, *ppath;
+ struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = fs_info->extent_root;
- struct btrfs_root *csum_root = fs_info->csum_root;
- struct btrfs_extent_item *extent;
+ struct btrfs_root *root;
+ struct btrfs_root *csum_root;
struct blk_plug plug;
- u64 flags;
+ struct map_lookup *map = em->map_lookup;
+ const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ const u64 chunk_logical = bg->start;
int ret;
- int slot;
- u64 nstripes;
- struct extent_buffer *l;
- u64 physical;
+ u64 physical = map->stripes[stripe_index].physical;
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
- u64 physical_end;
- u64 generation;
- int mirror_num;
- struct reada_control *reada1;
- struct reada_control *reada2;
- struct btrfs_key key;
- struct btrfs_key key_end;
- u64 increment = map->stripe_len;
+ /* The logical increment after finishing one stripe */
+ u64 increment;
+ /* Offset inside the chunk */
u64 offset;
- u64 extent_logical;
- u64 extent_physical;
- u64 extent_len;
u64 stripe_logical;
u64 stripe_end;
- struct btrfs_device *extent_dev;
- int extent_mirror_num;
int stop_loop = 0;
- physical = map->stripes[num].physical;
- offset = 0;
- nstripes = div64_u64(length, map->stripe_len);
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- offset = map->stripe_len * num;
- increment = map->stripe_len * map->num_stripes;
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- int factor = map->num_stripes / map->sub_stripes;
- offset = map->stripe_len * (num / map->sub_stripes);
- increment = map->stripe_len * factor;
- mirror_num = num % map->sub_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- increment = map->stripe_len;
- mirror_num = num % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- increment = map->stripe_len;
- mirror_num = num % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical, num, map, &offset, NULL);
- increment = map->stripe_len * nr_data_stripes(map);
- mirror_num = 1;
- } else {
- increment = map->stripe_len;
- mirror_num = 1;
- }
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ppath = btrfs_alloc_path();
- if (!ppath) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
-
/*
* work on commit root. The related disk blocks are static as
* long as COW is applied. This means, it is save to rewrite
@@ -3121,49 +3592,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
path->search_commit_root = 1;
path->skip_locking = 1;
+ path->reada = READA_FORWARD;
- ppath->search_commit_root = 1;
- ppath->skip_locking = 1;
- /*
- * trigger the readahead for extent tree csum tree and wait for
- * completion. During readahead, the scrub is officially paused
- * to not hold off transaction commits
- */
- logical = base + offset;
- physical_end = physical + nstripes * map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical_end, num,
- map, &logic_end, NULL);
- logic_end += base;
- } else {
- logic_end = logical + increment * nstripes;
- }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- /* FIXME it might be better to start readahead at commit root */
- key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)0;
- key_end.objectid = logic_end;
- key_end.type = BTRFS_METADATA_ITEM_KEY;
- key_end.offset = (u64)-1;
- reada1 = btrfs_reada_add(root, &key, &key_end);
-
- key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.type = BTRFS_EXTENT_CSUM_KEY;
- key.offset = logical;
- key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key, &key_end);
-
- if (!IS_ERR(reada1))
- btrfs_reada_wait(reada1);
- if (!IS_ERR(reada2))
- btrfs_reada_wait(reada2);
-
+ root = btrfs_extent_root(fs_info, bg->start);
+ csum_root = btrfs_csum_root(fs_info, bg->start);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -3171,229 +3607,98 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
blk_start_plug(&plug);
+ if (sctx->is_dev_replace &&
+ btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
+ mutex_lock(&sctx->wr_lock);
+ sctx->write_pointer = physical;
+ mutex_unlock(&sctx->wr_lock);
+ sctx->flush_all_writes = true;
+ }
+
/*
- * now find all extents for each stripe and scrub them
+ * There used to be a big double loop to handle all profiles using the
+ * same routine, which grows larger and more gross over time.
+ *
+ * So here we handle each profile differently, so simpler profiles
+ * have simpler scrubbing function.
*/
- ret = 0;
- while (physical < physical_end) {
- /*
- * canceled?
- */
- if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sctx->cancel_req)) {
- ret = -ECANCELED;
- goto out;
- }
+ if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
- * check to see if we have to pause
+ * Above check rules out all complex profile, the remaining
+ * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
+ * mirrored duplication without stripe.
+ *
+ * Only @physical and @mirror_num needs to calculated using
+ * @stripe_index.
*/
- if (atomic_read(&fs_info->scrub_pause_req)) {
- /* push queued extents */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
- sctx->flush_all_writes = false;
- scrub_blocked_if_needed(fs_info);
- }
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, num, map,
- &logical,
- &stripe_logical);
- logical += base;
- if (ret) {
- /* it is parity strip */
- stripe_logical += base;
- stripe_end = stripe_logical + increment;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- ppath, stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
-
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- /* there's no smaller item, so stick with the
- * larger one */
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logical)
- goto next;
-
- if (key.objectid >= logical + map->stripe_len) {
- /* out of this device extent */
- if (key.objectid >= logic_end)
- stop_loop = 1;
- break;
- }
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logical ||
- key.objectid + bytes >
- logical + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logical);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-
-again:
- extent_logical = key.objectid;
- extent_len = bytes;
-
- /*
- * trim extent to this stripe
- */
- if (extent_logical < logical) {
- extent_len -= logical - extent_logical;
- extent_logical = logical;
- }
- if (extent_logical + extent_len >
- logical + map->stripe_len) {
- extent_len = logical + map->stripe_len -
- extent_logical;
- }
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ bg->start, bg->length, scrub_dev,
+ map->stripes[stripe_index].physical,
+ stripe_index + 1);
+ offset = 0;
+ goto out;
+ }
+ if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
+ scrub_dev, stripe_index);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ goto out;
+ }
- extent_physical = extent_logical - logical + physical;
- extent_dev = scrub_dev;
- extent_mirror_num = mirror_num;
- if (sctx->is_dev_replace)
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
-
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical +
- extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
+ /* Only RAID56 goes through the old code */
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ ret = 0;
- ret = scrub_extent(sctx, map, extent_logical, extent_len,
- extent_physical, extent_dev, flags,
- generation, extent_mirror_num,
- extent_logical - logical + physical);
+ /* Calculate the logical end of the stripe */
+ get_raid56_logic_offset(physical_end, stripe_index,
+ map, &logic_end, NULL);
+ logic_end += chunk_logical;
- scrub_free_csums(sctx);
+ /* Initialize @offset in case we need to go to out: label */
+ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
+ increment = map->stripe_len * nr_data_stripes(map);
+ /*
+ * Due to the rotation, for RAID56 it's better to iterate each stripe
+ * using their physical offset.
+ */
+ while (physical < physical_end) {
+ ret = get_raid56_logic_offset(physical, stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
+ if (ret) {
+ /* it is parity strip */
+ stripe_logical += chunk_logical;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ stripe_logical,
+ stripe_end);
if (ret)
goto out;
+ goto next;
+ }
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /*
- * loop until we find next data stripe
- * or we have finished all stripes.
- */
-loop:
- physical += map->stripe_len;
- ret = get_raid56_logic_offset(physical,
- num, map, &logical,
- &stripe_logical);
- logical += base;
-
- if (ret && physical < physical_end) {
- stripe_logical += base;
- stripe_end = stripe_logical +
- increment;
- ret = scrub_raid56_parity(sctx,
- map, scrub_dev, ppath,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto loop;
- }
- } else {
- physical += map->stripe_len;
- logical += increment;
- }
- if (logical < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
-
- if (physical >= physical_end) {
- stop_loop = 1;
- break;
- }
- }
+ /*
+ * Now we're at a data stripe, scrub each extents in the range.
+ *
+ * At this stage, if we ignore the repair part, inside each data
+ * stripe it is no different than SINGLE profile.
+ * We can reuse scrub_simple_mirror() here, as the repair part
+ * is still based on @mirror_num.
+ */
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ logical, map->stripe_len,
+ scrub_dev, physical, 1);
+ if (ret < 0)
+ goto out;
next:
- path->slots[0]++;
- }
- btrfs_release_path(path);
-skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
- sctx->stat.last_physical = map->stripes[num].physical +
- length;
+ sctx->stat.last_physical =
+ map->stripes[stripe_index].physical + dev_stripe_len;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
@@ -3409,15 +3714,26 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
- btrfs_free_path(ppath);
+
+ if (sctx->is_dev_replace && ret >= 0) {
+ int ret2;
+
+ ret2 = sync_write_pointer_for_zoned(sctx,
+ chunk_logical + offset,
+ map->stripes[stripe_index].physical,
+ physical_end);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+ struct btrfs_block_group *bg,
struct btrfs_device *scrub_dev,
- u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group *cache)
+ u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
@@ -3427,7 +3743,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
int ret = 0;
read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+ em = lookup_extent_mapping(map_tree, bg->start, bg->length);
read_unlock(&map_tree->lock);
if (!em) {
@@ -3435,26 +3751,23 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
*/
- spin_lock(&cache->lock);
- if (!cache->removed)
+ spin_lock(&bg->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
ret = -EINVAL;
- spin_unlock(&cache->lock);
+ spin_unlock(&bg->lock);
return ret;
}
-
- map = em->map_lookup;
- if (em->start != chunk_offset)
+ if (em->start != bg->start)
goto out;
-
- if (em->len < length)
+ if (em->len < dev_extent_len)
goto out;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length);
+ ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
if (ret)
goto out;
}
@@ -3465,6 +3778,25 @@ out:
return ret;
}
+static int finish_extent_writes_for_zoned(struct btrfs_root *root,
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_trans_handle *trans;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_wait_block_group_reservations(cache);
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ return btrfs_commit_transaction(trans);
+}
+
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
@@ -3473,7 +3805,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- u64 length;
u64 chunk_offset;
int ret = 0;
int ro_set;
@@ -3497,6 +3828,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
key.type = BTRFS_DEV_EXTENT_KEY;
while (1) {
+ u64 dev_extent_len;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
break;
@@ -3533,9 +3866,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- length = btrfs_dev_extent_length(l, dev_extent);
+ dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start)
+ if (found_key.offset + dev_extent_len <= start)
goto skip;
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
@@ -3551,6 +3884,55 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents because we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
+ if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+ }
+
+ /*
+ * Make sure that while we are scrubbing the corresponding block
+ * group doesn't get its logical address and its device extents
+ * reused for another block group, which can possibly be of a
+ * different type and different profile. We do this to prevent
+ * false error detections and crashes due to bogus attempts to
+ * repair extents.
+ */
+ spin_lock(&cache->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+ btrfs_freeze_block_group(cache);
+ spin_unlock(&cache->lock);
+
/*
* we need call btrfs_inc_block_group_ro() with scrubs_paused,
* to avoid deadlock caused by:
@@ -3592,6 +3974,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* group is not RO.
*/
ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
+ if (!ret && sctx->is_dev_replace) {
+ ret = finish_extent_writes_for_zoned(root, cache);
+ if (ret) {
+ btrfs_dec_block_group_ro(cache);
+ scrub_pause_off(fs_info);
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
if (ret == 0) {
ro_set = 1;
} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
@@ -3603,9 +3995,17 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* commit_transactions.
*/
ro_set = 0;
+ } else if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ cache->start);
+ scrub_pause_off(fs_info);
+ ret = 0;
+ goto skip_unfreeze;
} else {
btrfs_warn(fs_info,
"failed setting block group ro: %d", ret);
+ btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
break;
@@ -3624,13 +4024,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
- dev_replace->cursor_right = found_key.offset + length;
+ dev_replace->cursor_right = found_key.offset + dev_extent_len;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache);
+ ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
+ dev_extent_len);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3664,6 +4064,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
+ if (sctx->is_dev_replace &&
+ !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
+ cache, found_key.offset))
+ ro_set = 0;
+
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
@@ -3680,8 +4085,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* balance is triggered or it becomes used and unused again.
*/
spin_lock(&cache->lock);
- if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- cache->used == 0) {
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+ !cache->ro && cache->reserved == 0 && cache->used == 0) {
spin_unlock(&cache->lock);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_queue_work(&fs_info->discard_ctl,
@@ -3691,7 +4096,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
} else {
spin_unlock(&cache->lock);
}
-
+skip_unfreeze:
+ btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
if (ret)
break;
@@ -3705,7 +4111,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
skip:
- key.offset = found_key.offset + length;
+ key.offset = found_key.offset + dev_extent_len;
btrfs_release_path(path);
}
@@ -3723,8 +4129,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- return -EIO;
+ if (BTRFS_FS_ERROR(fs_info))
+ return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
@@ -3737,10 +4143,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (bytenr + BTRFS_SUPER_INFO_SIZE >
scrub_dev->commit_total_bytes)
break;
+ if (!btrfs_check_super_location(scrub_dev, bytenr))
+ continue;
- ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, 1, bytenr);
+ ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, bytenr);
if (ret)
return ret;
}
@@ -3749,125 +4157,136 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
return 0;
}
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
+ struct workqueue_struct *scrub_wr_comp =
+ fs_info->scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity =
+ fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ if (scrub_workers)
+ destroy_workqueue(scrub_workers);
+ if (scrub_wr_comp)
+ destroy_workqueue(scrub_wr_comp);
+ if (scrub_parity)
+ destroy_workqueue(scrub_parity);
+ }
+}
+
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
+ struct workqueue_struct *scrub_workers = NULL;
+ struct workqueue_struct *scrub_wr_comp = NULL;
+ struct workqueue_struct *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
- lockdep_assert_held(&fs_info->scrub_lock);
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
- if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL);
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
- flags, is_dev_replace ? 1 : max_active, 4);
- if (!fs_info->scrub_workers)
- goto fail_scrub_workers;
-
- ASSERT(fs_info->scrub_wr_completion_workers == NULL);
- fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
- if (!fs_info->scrub_wr_completion_workers)
- goto fail_scrub_wr_completion_workers;
-
- ASSERT(fs_info->scrub_parity_workers == NULL);
- fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
- max_active, 2);
- if (!fs_info->scrub_parity_workers)
- goto fail_scrub_parity_workers;
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags,
+ is_dev_replace ? 1 : max_active);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
+
+ scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+ scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
- } else {
- refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
}
- return 0;
+ /* Other thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = 0;
+ destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(fs_info->scrub_workers);
+ destroy_workqueue(scrub_workers);
fail_scrub_workers:
- return -ENOMEM;
+ return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
- if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
- /*
- * in this case scrub is unable to calculate the checksum
- * the way scrub is implemented. Do not handle this
- * situation at all because it won't ever happen.
- */
- btrfs_err(fs_info,
- "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
- fs_info->nodesize,
- BTRFS_STRIPE_LEN);
- return -EINVAL;
- }
-
- if (fs_info->sectorsize != PAGE_SIZE) {
- /* not supported for data w/o checksums */
- btrfs_err_rl(fs_info,
- "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
- fs_info->sectorsize, PAGE_SIZE);
- return -EINVAL;
- }
+ /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
- if (fs_info->nodesize >
- PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
- fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
- /*
- * would exhaust the array bounds of pagev member in
- * struct scrub_block
- */
- btrfs_err(fs_info,
- "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->nodesize,
- SCRUB_MAX_PAGES_PER_BLOCK,
- fs_info->sectorsize,
- SCRUB_MAX_PAGES_PER_BLOCK);
- return -EINVAL;
- }
+ /*
+ * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+ * value (max nodesize / min sectorsize), thus nodesize should always
+ * be fine.
+ */
+ ASSERT(fs_info->nodesize <=
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
/* Allocate outside of device_list_mutex */
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
- goto out_free_ctx;
+ goto out;
}
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
- rcu_str_deref(dev->name));
+ btrfs_err_in_rcu(fs_info,
+ "scrub on devid %llu: filesystem on %s is not writable",
+ devid, rcu_str_deref(dev->name));
ret = -EROFS;
- goto out_free_ctx;
+ goto out;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3876,7 +4295,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
- goto out_free_ctx;
+ goto out;
}
down_read(&fs_info->dev_replace.rwsem);
@@ -3887,17 +4306,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
- goto out_free_ctx;
+ goto out;
}
up_read(&fs_info->dev_replace.rwsem);
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out_free_ctx;
- }
-
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3913,7 +4325,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
/*
* In order to avoid deadlock with reclaim when there is a transaction
* trying to pause scrub, make sure we use GFP_NOFS for all the
- * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
* invoked by our callees. The pausing request is done when the
* transaction commit starts, and it blocks the transaction until scrub
* is paused (done at specific points at scrub_stripe() or right above
@@ -3921,6 +4333,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -3929,6 +4347,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -3950,24 +4378,33 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
-
- fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
- }
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
-
+out:
+ scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
@@ -4042,11 +4479,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4056,27 +4494,27 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u64 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
{
u64 mapped_length;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bbio, 0);
- if (ret || !bbio || mapped_length < extent_len ||
- !bbio->stripes[0].dev->bdev) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
return;
}
- *extent_physical = bbio->stripes[0].physical;
- *extent_mirror_num = bbio->mirror_num;
- *extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a055b657cb85..145c84b44fd0 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -15,14 +15,18 @@
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
+#include <linux/fsverity.h>
#include "send.h"
+#include "ctree.h"
#include "backref.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
+#include "xattr.h"
+#include "print-tree.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -80,9 +84,15 @@ struct send_ctx {
char *send_buf;
u32 send_size;
u32 send_max_size;
- u64 total_send_size;
- u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ /*
+ * Whether BTRFS_SEND_A_DATA attribute was already added to current
+ * command (since protocol v2, data must be the last attribute).
+ */
+ bool put_data;
+ struct page **send_buf_pages;
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
@@ -95,20 +105,31 @@ struct send_ctx {
struct btrfs_key *cmp_key;
/*
+ * Keep track of the generation of the last transaction that was used
+ * for relocating a block group. This is periodically checked in order
+ * to detect if a relocation happened since the last check, so that we
+ * don't operate on stale extent buffers for nodes (level >= 1) or on
+ * stale disk_bytenr values of file extent items.
+ */
+ u64 last_reloc_trans;
+
+ /*
* infos of the currently processed inode. In case of deleted inodes,
* these are the values from the deleted inode.
*/
u64 cur_ino;
u64 cur_inode_gen;
- int cur_inode_new;
- int cur_inode_new_gen;
- int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool cur_inode_new;
+ bool cur_inode_new_gen;
+ bool cur_inode_deleted;
bool ignore_cur_inode;
+ bool cur_inode_needs_verity;
+ void *verity_descriptor;
u64 send_progress;
@@ -119,9 +140,14 @@ struct send_ctx {
struct list_head name_cache_list;
int name_cache_size;
+ /*
+ * The inode we are currently processing. It's not NULL only when we
+ * need to issue write commands for data extents from this inode.
+ */
+ struct inode *cur_inode;
struct file_ra_state ra;
-
- char *read_buf;
+ u64 page_cache_clear_start;
+ bool clean_page_cache;
/*
* We process inodes by their increasing order, so if before an
@@ -217,6 +243,9 @@ struct send_ctx {
* Indexed by the inode number of the directory to be deleted.
*/
struct rb_root orphan_dirs;
+
+ struct rb_root rbtree_new_refs;
+ struct rb_root rbtree_deleted_refs;
};
struct pending_dir_move {
@@ -237,6 +266,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ u64 rmdir_gen;
bool orphanized;
};
@@ -277,11 +307,6 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -317,12 +342,23 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
sctx->parent_root->root_key.objectid : 0));
}
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
+ case 3: return cmd <= BTRFS_SEND_C_MAX_V3;
+ default: return false;
+ }
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
-static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
static int need_send_hole(struct send_ctx *sctx)
{
@@ -511,17 +547,12 @@ out:
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
- int ret;
-
p->reversed = from->reversed;
fs_path_reset(p);
- ret = fs_path_add_path(p, from);
-
- return ret;
+ return fs_path_add_path(p, from);
}
-
static void fs_path_unreverse(struct fs_path *p)
{
char *tmp;
@@ -558,15 +589,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
while (pos < len) {
ret = kernel_write(filp, buf + pos, len - pos, off);
- /* TODO handle that correctly */
- /*if (ret == -ERESTARTSYS) {
- continue;
- }*/
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (ret == 0)
return -EIO;
- }
pos += ret;
}
@@ -579,12 +605,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
int total_len = sizeof(*hdr) + len;
int left = sctx->send_max_size - sctx->send_size;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+
if (unlikely(left < total_len))
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -599,6 +628,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
}
+TLV_PUT_DEFINE_INT(8)
+TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
@@ -674,8 +705,7 @@ static int send_header(struct send_ctx *sctx)
struct btrfs_stream_header hdr;
strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
- hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
-
+ hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
}
@@ -694,7 +724,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -706,18 +736,17 @@ static int send_cmd(struct send_ctx *sctx)
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
- sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
+ sctx->put_data = false;
return ret;
}
@@ -818,17 +847,32 @@ out:
return ret;
}
+struct btrfs_inode_info {
+ u64 size;
+ u64 gen;
+ u64 mode;
+ u64 uid;
+ u64 gid;
+ u64 rdev;
+ u64 fileattr;
+ u64 nlink;
+};
+
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
- u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
- u64 *gid, u64 *rdev)
+static int get_inode_info(struct btrfs_root *root, u64 ino,
+ struct btrfs_inode_info *info)
{
int ret;
+ struct btrfs_path *path;
struct btrfs_inode_item *ii;
struct btrfs_key key;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
@@ -836,41 +880,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- return ret;
+ goto out;
}
+ if (!info)
+ goto out;
+
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- if (size)
- *size = btrfs_inode_size(path->nodes[0], ii);
- if (gen)
- *gen = btrfs_inode_generation(path->nodes[0], ii);
- if (mode)
- *mode = btrfs_inode_mode(path->nodes[0], ii);
- if (uid)
- *uid = btrfs_inode_uid(path->nodes[0], ii);
- if (gid)
- *gid = btrfs_inode_gid(path->nodes[0], ii);
- if (rdev)
- *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->size = btrfs_inode_size(path->nodes[0], ii);
+ info->gen = btrfs_inode_generation(path->nodes[0], ii);
+ info->mode = btrfs_inode_mode(path->nodes[0], ii);
+ info->uid = btrfs_inode_uid(path->nodes[0], ii);
+ info->gid = btrfs_inode_gid(path->nodes[0], ii);
+ info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
+ /*
+ * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
+ * otherwise logically split to 32/32 parts.
+ */
+ info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
+out:
+ btrfs_free_path(path);
return ret;
}
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
- struct btrfs_path *path;
int ret;
+ struct btrfs_inode_info info;
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
- ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
- rdev);
- btrfs_free_path(path);
+ if (!gen)
+ return -EPERM;
+
+ ret = get_inode_info(root, ino, &info);
+ if (!ret)
+ *gen = info.gen;
return ret;
}
@@ -891,7 +937,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iterate_inode_ref_t iterate, void *ctx)
{
struct extent_buffer *eb = path->nodes[0];
- struct btrfs_item *item;
struct btrfs_inode_ref *iref;
struct btrfs_inode_extref *extref;
struct btrfs_path *tmp_path;
@@ -923,12 +968,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
if (found_key->type == BTRFS_INODE_REF_KEY) {
ptr = (unsigned long)btrfs_item_ptr(eb, slot,
struct btrfs_inode_ref);
- item = btrfs_item_nr(slot);
- total = btrfs_item_size(eb, item);
+ total = btrfs_item_size(eb, slot);
elem_size = sizeof(*iref);
} else {
ptr = btrfs_item_ptr_offset(eb, slot);
- total = btrfs_item_size_nr(eb, slot);
+ total = btrfs_item_size(eb, slot);
elem_size = sizeof(*extref);
}
@@ -997,7 +1041,7 @@ out:
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx);
+ void *ctx);
/*
* Helper function to iterate the entries in ONE btrfs_dir_item.
@@ -1011,7 +1055,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
{
int ret = 0;
struct extent_buffer *eb;
- struct btrfs_item *item;
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
@@ -1023,7 +1066,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
u32 total;
int slot;
int num;
- u8 type;
/*
* Start with a small buffer (1 page). If later we end up needing more
@@ -1040,20 +1082,18 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
eb = path->nodes[0];
slot = path->slots[0];
- item = btrfs_item_nr(slot);
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
cur = 0;
len = 0;
- total = btrfs_item_size(eb, item);
+ total = btrfs_item_size(eb, slot);
num = 0;
while (cur < total) {
name_len = btrfs_dir_name_len(eb, di);
data_len = btrfs_dir_data_len(eb, di);
- type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (type == BTRFS_FT_XATTR) {
+ if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@@ -1103,7 +1143,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
cur += len;
ret = iterate(num, &di_key, buf, name_len, buf + name_len,
- data_len, type, ctx);
+ data_len, ctx);
if (ret < 0)
goto out;
if (ret) {
@@ -1196,9 +1236,6 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
- /* data offset in the file extent item */
- u64 data_offset;
-
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1206,7 +1243,7 @@ struct backref_ctx {
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
{
u64 root = (u64)(uintptr_t)key;
- struct clone_root *cr = (struct clone_root *)elt;
+ const struct clone_root *cr = elt;
if (root < cr->root->root_key.objectid)
return -1;
@@ -1217,8 +1254,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
static int __clone_root_cmp_sort(const void *e1, const void *e2)
{
- struct clone_root *cr1 = (struct clone_root *)e1;
- struct clone_root *cr2 = (struct clone_root *)e2;
+ const struct clone_root *cr1 = e1;
+ const struct clone_root *cr2 = e2;
if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
@@ -1315,7 +1352,7 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx *backref_ctx = NULL;
+ struct backref_ctx backref_ctx = {0};
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
@@ -1330,12 +1367,6 @@ static int find_extent_clone(struct send_ctx *sctx,
/* We only use this path under the commit sem */
tmp_path->need_commit_sem = 0;
- backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
- if (!backref_ctx) {
- ret = -ENOMEM;
- goto out;
- }
-
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1400,25 +1431,12 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root->found_refs = 0;
}
- backref_ctx->sctx = sctx;
- backref_ctx->found = 0;
- backref_ctx->cur_objectid = ino;
- backref_ctx->cur_offset = data_offset;
- backref_ctx->found_itself = 0;
- backref_ctx->extent_len = num_bytes;
- /*
- * For non-compressed extents iterate_extent_inodes() gives us extent
- * offsets that already take into account the data offset, but not for
- * compressed extents, since the offset is logical and not relative to
- * the physical extent locations. We must take this into account to
- * avoid sending clone offsets that go beyond the source file's size,
- * which would result in the clone ioctl failing with -EINVAL on the
- * receiving end.
- */
- if (compressed == BTRFS_COMPRESS_NONE)
- backref_ctx->data_offset = 0;
- else
- backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
+ backref_ctx.sctx = sctx;
+ backref_ctx.found = 0;
+ backref_ctx.cur_objectid = ino;
+ backref_ctx.cur_offset = data_offset;
+ backref_ctx.found_itself = 0;
+ backref_ctx.extent_len = num_bytes;
/*
* The last extent of a file may be too large due to page alignment.
@@ -1426,7 +1444,7 @@ static int find_extent_clone(struct send_ctx *sctx,
* __iterate_backrefs work.
*/
if (data_offset + num_bytes >= ino_size)
- backref_ctx->extent_len = ino_size - data_offset;
+ backref_ctx.extent_len = ino_size - data_offset;
/*
* Now collect all backrefs.
@@ -1437,12 +1455,32 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = 0;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1, __iterate_backrefs,
- backref_ctx, false);
+ &backref_ctx, false);
if (ret < 0)
goto out;
- if (!backref_ctx->found_itself) {
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ /*
+ * A transaction commit for a transaction in which block group
+ * relocation was done just happened.
+ * The disk_bytenr of the file extent item we processed is
+ * possibly stale, referring to the extent's location before
+ * relocation. So act as if we haven't found any clone sources
+ * and fallback to write commands, which will read the correct
+ * data from the new extent location. Otherwise we will fail
+ * below because we haven't found our own back reference or we
+ * could be getting incorrect sources in case the old extent
+ * was already reallocated after the relocation.
+ */
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOENT;
+ goto out;
+ }
+ up_read(&fs_info->commit_root_sem);
+
+ if (!backref_ctx.found_itself) {
/* found a bug in backref code? */
ret = -EIO;
btrfs_err(fs_info,
@@ -1455,7 +1493,7 @@ static int find_extent_clone(struct send_ctx *sctx,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx->found)
+ if (!backref_ctx.found)
btrfs_debug(fs_info, "no clones found");
cur_clone_root = NULL;
@@ -1479,7 +1517,6 @@ static int find_extent_clone(struct send_ctx *sctx,
out:
btrfs_free_path(tmp_path);
- kfree(backref_ctx);
return ret;
}
@@ -1622,21 +1659,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
int right_ret;
u64 left_gen;
u64 right_gen;
+ struct btrfs_inode_info info;
- ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- left_ret = ret;
+ left_ret = (info.nlink == 0) ? -ENOENT : ret;
+ left_gen = info.gen;
if (!sctx->parent_root) {
right_ret = -ENOENT;
} else {
- ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- right_ret = ret;
+ right_ret = (info.nlink == 0) ? -ENOENT : ret;
+ right_gen = info.gen;
}
if (!left_ret && !right_ret) {
@@ -1708,8 +1746,7 @@ out:
*/
static int lookup_dir_item_inode(struct btrfs_root *root,
u64 dir, const char *name, int name_len,
- u64 *found_inode,
- u8 *found_type)
+ u64 *found_inode)
{
int ret = 0;
struct btrfs_dir_item *di;
@@ -1732,7 +1769,6 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
goto out;
}
*found_inode = key.objectid;
- *found_type = btrfs_dir_type(path->nodes[0], di);
out:
btrfs_free_path(path);
@@ -1797,8 +1833,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
btrfs_release_path(path);
if (dir_gen) {
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
goto out;
}
@@ -1855,7 +1890,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
int ret = 0;
u64 gen;
u64 other_inode = 0;
- u8 other_type = 0;
+ struct btrfs_inode_info info;
if (!sctx->parent_root)
goto out;
@@ -1870,8 +1905,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* and we can just unlink this entry.
*/
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1883,7 +1917,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
}
ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
- &other_inode, &other_type);
+ &other_inode);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1898,13 +1932,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
*/
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
- ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, who_mode, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, other_inode, &info);
if (ret < 0)
goto out;
ret = 1;
*who_ino = other_inode;
+ *who_gen = info.gen;
+ *who_mode = info.mode;
} else {
ret = 0;
}
@@ -1928,7 +1963,6 @@ static int did_overwrite_ref(struct send_ctx *sctx,
int ret = 0;
u64 gen;
u64 ow_inode;
- u8 other_type;
if (!sctx->parent_root)
goto out;
@@ -1938,8 +1972,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
if (dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1952,7 +1985,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
/* check if the ref was overwritten by another ref */
ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
- &ow_inode, &other_type);
+ &ow_inode);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1961,8 +1994,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
if (ret < 0)
goto out;
@@ -2099,16 +2131,6 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
}
/*
- * Removes the entry from the list and adds it back to the end. This marks the
- * entry as recently used so that name_cache_clean_unused does not remove it.
- */
-static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
-{
- list_del(&nce->list);
- list_add_tail(&nce->list, &sctx->name_cache_list);
-}
-
-/*
* Remove some entries from the beginning of name_cache_list.
*/
static void name_cache_clean_unused(struct send_ctx *sctx)
@@ -2168,7 +2190,13 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
kfree(nce);
nce = NULL;
} else {
- name_cache_used(sctx, nce);
+ /*
+ * Removes the entry from the list and adds it back to
+ * the end. This marks the entry as recently used so
+ * that name_cache_clean_unused does not remove it.
+ */
+ list_move_tail(&nce->list, &sctx->name_cache_list);
+
*parent_ino = nce->parent_ino;
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
@@ -2182,7 +2210,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
/*
* If the inode is not existent yet, add the orphan name and return 1.
* This should only happen for the parent dir that we determine in
- * __record_new_ref
+ * record_new_ref_if_needed().
*/
ret = is_inode_existent(sctx, ino, gen);
if (ret < 0)
@@ -2305,7 +2333,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(name);
- if (is_waiting_for_rm(sctx, ino)) {
+ if (is_waiting_for_rm(sctx, ino, gen)) {
ret = gen_unique_name(sctx, ino, gen, name);
if (ret < 0)
goto out;
@@ -2416,7 +2444,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
sctx->send_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
- le64_to_cpu(sctx->send_root->root_item.ctransid));
+ btrfs_root_ctransid(&sctx->send_root->root_item));
if (parent_root) {
if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
@@ -2425,7 +2453,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- le64_to_cpu(sctx->parent_root->root_item.ctransid));
+ btrfs_root_ctransid(&sctx->parent_root->root_item));
}
ret = send_cmd(sctx);
@@ -2497,6 +2525,39 @@ out:
return ret;
}
+static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ if (sctx->proto < 2)
+ return 0;
+
+ btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
@@ -2576,7 +2637,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
- /* TODO Add otime support when the otime patches get into upstream */
+ if (sctx->proto >= 2)
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
ret = send_cmd(sctx);
@@ -2598,6 +2660,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
int ret = 0;
struct fs_path *p;
int cmd;
+ struct btrfs_inode_info info;
u64 gen;
u64 mode;
u64 rdev;
@@ -2609,10 +2672,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
if (ino != sctx->cur_ino) {
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
- NULL, NULL, &rdev);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0)
goto out;
+ gen = info.gen;
+ mode = info.mode;
+ rdev = info.rdev;
} else {
gen = sctx->cur_inode_gen;
mode = sctx->cur_inode_mode;
@@ -2680,61 +2745,43 @@ out:
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
- struct extent_buffer *eb;
struct btrfs_dir_item *di;
- int slot;
path = alloc_path_for_send();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->send_root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
+ btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *eb = path->nodes[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
- goto out;
+ break;
}
-
- path->slots[0]++;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -2752,19 +2799,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
if (S_ISDIR(sctx->cur_inode_mode)) {
ret = did_create_dir(sctx, sctx->cur_ino);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ else if (ret > 0)
+ return 0;
}
- ret = send_create_inode(sctx, sctx->cur_ino);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
+ return send_create_inode(sctx, sctx->cur_ino);
}
struct recorded_ref {
@@ -2774,48 +2814,50 @@ struct recorded_ref {
u64 dir;
u64 dir_gen;
int name_len;
+ struct rb_node node;
+ struct rb_root *root;
};
-static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+static struct recorded_ref *recorded_ref_alloc(void)
{
- ref->full_path = path;
- ref->name = (char *)kbasename(ref->full_path->start);
- ref->name_len = ref->full_path->end - ref->name;
+ struct recorded_ref *ref;
+
+ ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!ref)
+ return NULL;
+ RB_CLEAR_NODE(&ref->node);
+ INIT_LIST_HEAD(&ref->list);
+ return ref;
}
-/*
- * We need to process new refs before deleted refs, but compare_tree gives us
- * everything mixed. So we first record all refs and later process them.
- * This function is a helper to record one ref.
- */
-static int __record_ref(struct list_head *head, u64 dir,
- u64 dir_gen, struct fs_path *path)
+static void recorded_ref_free(struct recorded_ref *ref)
{
- struct recorded_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_KERNEL);
if (!ref)
- return -ENOMEM;
+ return;
+ if (!RB_EMPTY_NODE(&ref->node))
+ rb_erase(&ref->node, ref->root);
+ list_del(&ref->list);
+ fs_path_free(ref->full_path);
+ kfree(ref);
+}
- ref->dir = dir;
- ref->dir_gen = dir_gen;
- set_ref_path(ref, path);
- list_add_tail(&ref->list, head);
- return 0;
+static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+{
+ ref->full_path = path;
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
}
static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
struct recorded_ref *new;
- new = kmalloc(sizeof(*ref), GFP_KERNEL);
+ new = recorded_ref_alloc();
if (!new)
return -ENOMEM;
new->dir = ref->dir;
new->dir_gen = ref->dir_gen;
- new->full_path = NULL;
- INIT_LIST_HEAD(&new->list);
list_add_tail(&new->list, list);
return 0;
}
@@ -2826,9 +2868,7 @@ static void __free_recorded_refs(struct list_head *head)
while (!list_empty(head)) {
cur = list_entry(head->next, struct recorded_ref, list);
- fs_path_free(cur->full_path);
- list_del(&cur->list);
- kfree(cur);
+ recorded_ref_free(cur);
}
}
@@ -2864,8 +2904,8 @@ out:
return ret;
}
-static struct orphan_dir_info *
-add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
+ u64 dir_ino, u64 dir_gen)
{
struct rb_node **p = &sctx->orphan_dirs.rb_node;
struct rb_node *parent = NULL;
@@ -2874,20 +2914,23 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
while (*p) {
parent = *p;
entry = rb_entry(parent, struct orphan_dir_info, node);
- if (dir_ino < entry->ino) {
+ if (dir_ino < entry->ino)
p = &(*p)->rb_left;
- } else if (dir_ino > entry->ino) {
+ else if (dir_ino > entry->ino)
p = &(*p)->rb_right;
- } else {
+ else if (dir_gen < entry->gen)
+ p = &(*p)->rb_left;
+ else if (dir_gen > entry->gen)
+ p = &(*p)->rb_right;
+ else
return entry;
- }
}
odi = kmalloc(sizeof(*odi), GFP_KERNEL);
if (!odi)
return ERR_PTR(-ENOMEM);
odi->ino = dir_ino;
- odi->gen = 0;
+ odi->gen = dir_gen;
odi->last_dir_index_offset = 0;
rb_link_node(&odi->node, parent, p);
@@ -2895,8 +2938,8 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
return odi;
}
-static struct orphan_dir_info *
-get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
+ u64 dir_ino, u64 gen)
{
struct rb_node *n = sctx->orphan_dirs.rb_node;
struct orphan_dir_info *entry;
@@ -2907,15 +2950,19 @@ get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
n = n->rb_left;
else if (dir_ino > entry->ino)
n = n->rb_right;
+ else if (gen < entry->gen)
+ n = n->rb_left;
+ else if (gen > entry->gen)
+ n = n->rb_right;
else
return entry;
}
return NULL;
}
-static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
{
- struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
return odi != NULL;
}
@@ -2938,6 +2985,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
u64 send_progress)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root = sctx->parent_root;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2960,27 +3008,13 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- odi = get_orphan_dir_info(sctx, dir);
+ odi = get_orphan_dir_info(sctx, dir, dir_gen);
if (odi)
key.offset = odi->last_dir_index_offset;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid ||
found_key.type != key.type)
break;
@@ -2991,7 +3025,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
dm = get_waiting_dir_move(sctx, loc.objectid);
if (dm) {
- odi = add_orphan_dir_info(sctx, dir);
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
if (IS_ERR(odi)) {
ret = PTR_ERR(odi);
goto out;
@@ -2999,12 +3033,13 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
odi->gen = dir_gen;
odi->last_dir_index_offset = found_key.offset;
dm->rmdir_ino = dir;
+ dm->rmdir_gen = dir_gen;
ret = 0;
goto out;
}
if (loc.objectid > send_progress) {
- odi = add_orphan_dir_info(sctx, dir);
+ odi = add_orphan_dir_info(sctx, dir, dir_gen);
if (IS_ERR(odi)) {
ret = PTR_ERR(odi);
goto out;
@@ -3014,8 +3049,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
ret = 0;
goto out;
}
-
- path->slots[0]++;
+ }
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
free_orphan_dir_info(sctx, odi);
@@ -3044,6 +3081,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->rmdir_gen = 0;
dm->orphanized = orphanized;
while (*p) {
@@ -3189,7 +3227,7 @@ static int path_loop(struct send_ctx *sctx, struct fs_path *name,
while (ino != BTRFS_FIRST_FREE_OBJECTID) {
fs_path_reset(name);
- if (is_waiting_for_rm(sctx, ino))
+ if (is_waiting_for_rm(sctx, ino, gen))
break;
if (is_waiting_for_move(sctx, ino)) {
if (*ancestor_ino == 0)
@@ -3229,6 +3267,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
u64 parent_ino, parent_gen;
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
+ u64 rmdir_gen;
u64 ancestor;
bool is_orphan;
int ret;
@@ -3243,6 +3282,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
rmdir_ino = dm->rmdir_ino;
+ rmdir_gen = dm->rmdir_gen;
is_orphan = dm->orphanized;
free_waiting_dir_move(sctx, dm);
@@ -3279,6 +3319,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
dm->rmdir_ino = rmdir_ino;
+ dm->rmdir_gen = rmdir_gen;
}
goto out;
}
@@ -3297,7 +3338,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
struct orphan_dir_info *odi;
u64 gen;
- odi = get_orphan_dir_info(sctx, rmdir_ino);
+ odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
if (!odi) {
/* already deleted */
goto finish;
@@ -3336,8 +3377,7 @@ finish:
/*
* The parent inode might have been deleted in the send snapshot
*/
- ret = get_inode_info(sctx->send_root, cur->dir, NULL,
- NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL);
if (ret == -ENOENT) {
ret = 0;
continue;
@@ -3511,12 +3551,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
- &left_gen, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
goto out;
- ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
- &right_gen, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -3579,7 +3617,7 @@ static int check_ino_in_path(struct btrfs_root *root,
}
/*
- * Check if ino ino1 is an ancestor of inode ino2 in the given root for any
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
* possible path (in case ino2 is not a directory and has multiple hard links).
* Return 1 if true, 0 if false and < 0 on error.
*/
@@ -3591,6 +3629,7 @@ static int is_ancestor(struct btrfs_root *root,
{
bool free_fs_path = false;
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -3611,33 +3650,19 @@ static int is_ancestor(struct btrfs_root *root,
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (true) {
+ btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u32 cur_offset = 0;
u32 item_size;
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino2)
break;
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
while (cur_offset < item_size) {
u64 parent;
u64 parent_gen;
@@ -3659,8 +3684,7 @@ static int is_ancestor(struct btrfs_root *root,
cur_offset = item_size;
}
- ret = get_inode_info(root, parent, NULL, &parent_gen,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent, &parent_gen);
if (ret < 0)
goto out;
ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3668,10 +3692,12 @@ static int is_ancestor(struct btrfs_root *root,
if (ret)
goto out;
}
- path->slots[0]++;
}
ret = 0;
- out:
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+out:
btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
@@ -3748,9 +3774,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
memcmp(path_before->start, path_after->start, len1))) {
u64 parent_ino_gen;
- ret = get_inode_info(sctx->parent_root, ino, NULL,
- &parent_ino_gen, NULL, NULL, NULL,
- NULL);
+ ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
if (ret < 0)
goto out;
if (ino_gen == parent_ino_gen) {
@@ -3812,6 +3836,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
}
/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- f1 (ino 257)
+ * |----- f2 (ino 258)
+ * |----- d1/ (ino 259)
+ * |----- d2/ (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- d1 (ino 258)
+ * |----- f2/ (ino 259)
+ * |----- f2_link/ (ino 260)
+ * | |----- f1 (ino 257)
+ * |
+ * |----- d2 (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists, it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
* This does all the move/link/unlink/rmdir magic.
*/
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
@@ -3879,52 +3969,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the path used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * than the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -3941,6 +4035,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct name_cache_entry *nce;
struct waiting_dir_move *wdm;
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
@@ -3997,12 +4097,66 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
} else {
+ /*
+ * If we previously orphanized a directory that
+ * collided with a new reference that we already
+ * processed, recompute the current path because
+ * that directory may be part of the path.
+ */
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
}
}
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
if (ret < 0)
@@ -4214,185 +4368,167 @@ out:
return ret;
}
-static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
- void *ctx, struct list_head *refs)
+static int rbtree_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+ int result;
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ if (data->name_len > ref->name_len)
+ return 1;
+ if (data->name_len < ref->name_len)
+ return -1;
+ result = strcmp(data->name, ref->name);
+ if (result > 0)
+ return 1;
+ if (result < 0)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_ref_comp(entry, parent) < 0;
+}
+
+static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
+ struct fs_path *name, u64 dir, u64 dir_gen,
+ struct send_ctx *sctx)
{
int ret = 0;
- struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
+ struct fs_path *path = NULL;
+ struct recorded_ref *ref = NULL;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ path = fs_path_alloc();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
+ ref = recorded_ref_alloc();
+ if (!ref) {
+ ret = -ENOMEM;
goto out;
+ }
- ret = get_cur_path(sctx, dir, gen, p);
+ ret = get_cur_path(sctx, dir, dir_gen, path);
if (ret < 0)
goto out;
- ret = fs_path_add_path(p, name);
+ ret = fs_path_add_path(path, name);
if (ret < 0)
goto out;
- ret = __record_ref(refs, dir, gen, p);
-
+ ref->dir = dir;
+ ref->dir_gen = dir_gen;
+ set_ref_path(ref, path);
+ list_add_tail(&ref->list, refs);
+ rb_add(&ref->node, root, rbtree_ref_less);
+ ref->root = root;
out:
- if (ret)
- fs_path_free(p);
+ if (ret) {
+ if (path && (!ref || !ref->full_path))
+ fs_path_free(path);
+ recorded_ref_free(ref);
+ }
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
-{
- struct send_ctx *sctx = ctx;
- return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
-}
-
-
-static int __record_deleted_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_new_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
{
+ int ret = 0;
struct send_ctx *sctx = ctx;
- return record_ref(sctx->parent_root, dir, name, ctx,
- &sctx->deleted_refs);
-}
-
-static int record_new_ref(struct send_ctx *sctx)
-{
- int ret;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_new_ref, sctx);
+ ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
goto out;
- ret = 0;
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_new_refs,
+ &sctx->new_refs, name, dir, dir_gen,
+ sctx);
+ }
out:
return ret;
}
-static int record_deleted_ref(struct send_ctx *sctx)
+static int record_deleted_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
{
- int ret;
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
goto out;
- ret = 0;
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
+ &sctx->deleted_refs, name, dir,
+ dir_gen, sctx);
+ }
out:
return ret;
}
-struct find_ref_ctx {
- u64 dir;
- u64 dir_gen;
- struct btrfs_root *root;
- struct fs_path *name;
- int found_idx;
-};
-
-static int __find_iref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx_)
-{
- struct find_ref_ctx *ctx = ctx_;
- u64 dir_gen;
- int ret;
-
- if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
- strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
- /*
- * To avoid doing extra lookups we'll only do this if everything
- * else matches.
- */
- ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
- if (dir_gen != ctx->dir_gen)
- return 0;
- ctx->found_idx = num;
- return 1;
- }
- return 0;
-}
-
-static int find_iref(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 dir, u64 dir_gen, struct fs_path *name)
+static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- struct find_ref_ctx ctx;
-
- ctx.dir = dir;
- ctx.name = name;
- ctx.dir_gen = dir_gen;
- ctx.found_idx = -1;
- ctx.root = root;
- ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- return ret;
-
- if (ctx.found_idx == -1)
- return -ENOENT;
-
- return ctx.found_idx;
-}
-
-static int __record_changed_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
-{
- u64 dir_gen;
- int ret;
- struct send_ctx *sctx = ctx;
-
- ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
-
- ret = find_iref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, dir, dir_gen, name);
- if (ret == -ENOENT)
- ret = __record_new_ref(num, dir, index, name, sctx);
- else if (ret > 0)
- ret = 0;
+ goto out;
+ ret = 0;
+out:
return ret;
}
-static int __record_changed_deleted_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_deleted_ref(struct send_ctx *sctx)
{
- u64 dir_gen;
int ret;
- struct send_ctx *sctx = ctx;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
-
- ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
- dir, dir_gen, name);
- if (ret == -ENOENT)
- ret = __record_deleted_ref(num, dir, index, name, sctx);
- else if (ret > 0)
- ret = 0;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, record_deleted_ref_if_needed,
+ sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+out:
return ret;
}
@@ -4401,11 +4537,11 @@ static int record_changed_ref(struct send_ctx *sctx)
int ret = 0;
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
goto out;
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+ sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -4421,13 +4557,12 @@ out:
static int process_all_refs(struct send_ctx *sctx,
enum btrfs_compare_tree_result cmd)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
iterate_inode_ref_t cb;
int pending_move = 0;
@@ -4437,10 +4572,10 @@ static int process_all_refs(struct send_ctx *sctx,
if (cmd == BTRFS_COMPARE_TREE_NEW) {
root = sctx->send_root;
- cb = __record_new_ref;
+ cb = record_new_ref_if_needed;
} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
root = sctx->parent_root;
- cb = __record_deleted_ref;
+ cb = record_deleted_ref_if_needed;
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
@@ -4451,24 +4586,7 @@ static int process_all_refs(struct send_ctx *sctx,
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
(found_key.type != BTRFS_INODE_REF_KEY &&
found_key.type != BTRFS_INODE_EXTREF_KEY))
@@ -4477,8 +4595,11 @@ static int process_all_refs(struct send_ctx *sctx,
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
if (ret < 0)
goto out;
-
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
btrfs_release_path(path);
@@ -4536,15 +4657,18 @@ out:
}
static int __process_new_xattr(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
+ const char *name, int name_len, const char *data,
+ int data_len, void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
+ /* Capabilities are emitted by finish_inode_if_needed */
+ if (!strncmp(name, XATTR_NAME_CAPS, name_len))
+ return 0;
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -4578,8 +4702,7 @@ out:
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *ctx)
+ const char *data, int data_len, void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4624,10 +4747,8 @@ struct find_xattr_ctx {
int found_data_len;
};
-static int __find_xattr(int num, struct btrfs_key *di_key,
- const char *name, int name_len,
- const char *data, int data_len,
- u8 type, void *vctx)
+static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
+ int name_len, const char *data, int data_len, void *vctx)
{
struct find_xattr_ctx *ctx = vctx;
@@ -4677,7 +4798,7 @@ static int find_xattr(struct btrfs_root *root,
static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx)
+ void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4689,12 +4810,12 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
&found_data_len);
if (ret == -ENOENT) {
ret = __process_new_xattr(num, di_key, name, name_len, data,
- data_len, type, ctx);
+ data_len, ctx);
} else if (ret >= 0) {
if (data_len != found_data_len ||
memcmp(data, found_data, data_len)) {
ret = __process_new_xattr(num, di_key, name, name_len,
- data, data_len, type, ctx);
+ data, data_len, ctx);
} else {
ret = 0;
}
@@ -4707,7 +4828,7 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len,
- u8 type, void *ctx)
+ void *ctx)
{
int ret;
struct send_ctx *sctx = ctx;
@@ -4716,7 +4837,7 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
name, name_len, NULL, NULL);
if (ret == -ENOENT)
ret = __process_deleted_xattr(num, di_key, name, name_len, data,
- data_len, type, ctx);
+ data_len, ctx);
else if (ret >= 0)
ret = 0;
@@ -4740,13 +4861,12 @@ out:
static int process_all_new_xattrs(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
@@ -4757,124 +4877,199 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static int send_verity(struct send_ctx *sctx, struct fs_path *path,
+ struct fsverity_descriptor *desc)
+{
+ int ret;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
+ le8_to_cpu(desc->hash_algorithm));
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
+ 1U << le8_to_cpu(desc->log_blocksize));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
+ le8_to_cpu(desc->salt_size));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
+ le32_to_cpu(desc->sig_size));
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int process_verity(struct send_ctx *sctx)
+{
+ int ret = 0;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct inode *inode;
+ struct fs_path *p;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ if (ret < 0)
+ goto iput;
+
+ if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ ret = -EMSGSIZE;
+ goto iput;
+ }
+ if (!sctx->verity_descriptor) {
+ sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
+ GFP_KERNEL);
+ if (!sctx->verity_descriptor) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ }
+
+ ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ if (ret < 0)
+ goto iput;
+
+ p = fs_path_alloc();
+ if (!p) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto free_path;
+
+ ret = send_verity(sctx, p, sctx->verity_descriptor);
+ if (ret < 0)
+ goto free_path;
+
+free_path:
+ fs_path_free(p);
+iput:
+ iput(inode);
+ return ret;
+}
+
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+ sctx->put_data = true;
+ if (sctx->proto >= 2) {
+ /*
+ * Since v2, the data attribute header doesn't include a length,
+ * it is implicitly to the end of the command.
+ */
+ if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ return -EOVERFLOW;
+ put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
+ sctx->send_size += sizeof(__le16);
+ } else {
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ }
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
struct page *page;
- char *addr;
- struct btrfs_key key;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
-
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ int ret;
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
last_index = (offset + len - 1) >> PAGE_SHIFT;
- /* initial readahead */
- memset(&sctx->ra, 0, sizeof(struct file_ra_state));
- file_ra_state_init(&sctx->ra, inode->i_mapping);
-
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(inode->i_mapping, index);
+ page = find_lock_page(sctx->cur_inode->i_mapping, index);
if (!page) {
- page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
- NULL, index, last_index + 1 - index);
+ page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, index,
+ last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- GFP_KERNEL);
+ page = find_or_create_page(sctx->cur_inode->i_mapping,
+ index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
break;
}
}
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping, &sctx->ra,
- NULL, page, index, last_index + 1 - index);
- }
+ if (PageReadahead(page))
+ page_cache_async_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, page_folio(page),
+ index, last_index + 1 - index);
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
}
}
- addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
- kunmap(page);
+ memcpy_from_page(sctx->send_buf + sctx->send_size, page,
+ pg_offset, cur_len);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
- iput(inode);
+
return ret;
}
@@ -4887,7 +5082,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -4895,13 +5089,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -4912,16 +5099,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -4957,8 +5144,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
if (clone_root->root == sctx->send_root) {
- ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -4984,7 +5170,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- le64_to_cpu(clone_root->root->root_item.ctransid));
+ btrfs_root_ctransid(&clone_root->root->root_item));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
clone_root->offset);
@@ -5033,8 +5219,8 @@ out:
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5061,16 +5247,19 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5082,41 +5271,359 @@ tlv_put_failure:
return ret;
}
-static int send_extent_data(struct send_ctx *sctx,
- const u64 offset,
- const u64 len)
+static int send_encoded_inline_extent(struct send_ctx *sctx,
+ struct btrfs_path *path, u64 offset,
+ u64 len)
{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 ram_bytes;
+ size_t inline_size;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + ram_bytes - offset, len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+
+ ret = put_data_header(sctx, inline_size);
+ if (ret < 0)
+ goto out;
+ read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
+ btrfs_file_extent_inline_start(ei), inline_size);
+ sctx->send_size += inline_size;
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
+ u64 offset, u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 disk_bytenr, disk_num_bytes;
+ u32 data_offset;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
+ len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
+ btrfs_file_extent_ram_bytes(leaf, ei));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
+ offset - key.offset + btrfs_file_extent_offset(leaf, ei));
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
+
+ ret = put_data_header(sctx, disk_num_bytes);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We want to do I/O directly into the send buffer, so get the next page
+ * boundary in the send buffer. This means that there may be a gap
+ * between the beginning of the command and the file data.
+ */
+ data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ if (data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ /*
+ * Note that send_buf is a mapping of send_buf_pages, so this is really
+ * reading into send_buf.
+ */
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ disk_bytenr, disk_num_bytes,
+ sctx->send_buf_pages +
+ (data_offset >> PAGE_SHIFT));
+ if (ret)
+ goto out;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
+ hdr->crc = 0;
+ crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+ if (!ret) {
+ ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
+ disk_num_bytes, &sctx->send_off);
+ }
+ sctx->send_size = 0;
+ sctx->put_data = false;
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
+ const u64 offset, const u64 len)
+{
+ const u64 end = offset + len;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
+ bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_INLINE);
+
+ /*
+ * Send the compressed extent unless the compressed data is
+ * larger than the decompressed data. This can happen if we're
+ * not sending the entire extent, either because it has been
+ * partially overwritten/truncated or because this is a part of
+ * the extent that we couldn't clone in clone_range().
+ */
+ if (is_inline &&
+ btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]) <= len) {
+ return send_encoded_inline_extent(sctx, path, offset,
+ len);
+ } else if (!is_inline &&
+ btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
+ return send_encoded_extent(sctx, path, offset, len);
+ }
+ }
+
+ if (sctx->cur_inode == NULL) {
+ struct btrfs_root *root = sctx->send_root;
+
+ sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(sctx->cur_inode)) {
+ int err = PTR_ERR(sctx->cur_inode);
+
+ sctx->cur_inode = NULL;
+ return err;
+ }
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+ /*
+ * It's very likely there are no pages from this inode in the page
+ * cache, so after reading extents and sending their data, we clean
+ * the page cache to avoid trashing the page cache (adding pressure
+ * to the page cache and forcing eviction of other data more useful
+ * for applications).
+ *
+ * We decide if we should clean the page cache simply by checking
+ * if the inode's mapping nrpages is 0 when we first open it, and
+ * not by using something like filemap_range_has_page() before
+ * reading an extent because when we ask the readahead code to
+ * read a given file range, it may (and almost always does) read
+ * pages from beyond that range (see the documentation for
+ * page_cache_sync_readahead()), so it would not be reliable,
+ * because after reading the first extent future calls to
+ * filemap_range_has_page() would return true because the readahead
+ * on the previous extent resulted in reading pages of the current
+ * extent as well.
+ */
+ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+ sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
+ }
+
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
+
+ if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ /*
+ * Always operate only on ranges that are a multiple of the page
+ * size. This is not only to prevent zeroing parts of a page in
+ * the case of subpage sector size, but also to guarantee we evict
+ * pages, as passing a range that is smaller than page size does
+ * not evict the respective page (only zeroes part of its content).
+ *
+ * Always start from the end offset of the last range cleared.
+ * This is because the readahead code may (and very often does)
+ * reads pages beyond the range we request for readahead. So if
+ * we have an extent layout like this:
+ *
+ * [ extent A ] [ extent B ] [ extent C ]
+ *
+ * When we ask page_cache_sync_readahead() to read extent A, it
+ * may also trigger reads for pages of extent B. If we are doing
+ * an incremental send and extent B has not changed between the
+ * parent and send snapshots, some or all of its pages may end
+ * up being read and placed in the page cache. So when truncating
+ * the page cache we always start from the end offset of the
+ * previously processed extent up to the end of the current
+ * extent.
+ */
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ end - 1);
+ sctx->page_cache_clear_start = end;
+ }
+
return 0;
}
-static int clone_range(struct send_ctx *sctx,
- struct clone_root *clone_root,
- const u64 disk_byte,
- u64 data_offset,
- u64 offset,
- u64 len)
+/*
+ * Search for a capability xattr related to sctx->cur_ino. If the capability is
+ * found, call send_set_xattr function to emit it.
+ *
+ * Return 0 if there isn't a capability, or when the capability was emitted
+ * successfully, or < 0 if an error occurred.
+ */
+static int send_capabilities(struct send_ctx *sctx)
+{
+ struct fs_path *fspath = NULL;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ unsigned long data_ptr;
+ char *buf = NULL;
+ int buf_len;
+ int ret = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
+ XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
+ if (!di) {
+ /* There is no xattr for this inode */
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ buf_len = btrfs_dir_data_len(leaf, di);
+
+ fspath = fs_path_alloc();
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!fspath || !buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
+ read_extent_buffer(leaf, buf, data_ptr, buf_len);
+
+ ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ strlen(XATTR_NAME_CAPS), buf, buf_len);
+out:
+ kfree(buf);
+ fs_path_free(fspath);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
+ struct clone_root *clone_root, const u64 disk_byte,
+ u64 data_offset, u64 offset, u64 len)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ struct btrfs_inode_info info;
u64 clone_src_i_size = 0;
/*
@@ -5136,7 +5643,7 @@ static int clone_range(struct send_ctx *sctx,
*/
if (clone_root->offset == 0 &&
len == sctx->send_root->fs_info->sectorsize)
- return send_extent_data(sctx, offset, len);
+ return send_extent_data(sctx, dst_path, offset, len);
path = alloc_path_for_send();
if (!path)
@@ -5146,11 +5653,11 @@ static int clone_range(struct send_ctx *sctx,
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = __get_inode_info(clone_root->root, path, clone_root->ino,
- &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
goto out;
+ clone_src_i_size = info.size;
/*
* We can't send a clone operation for the entire range if we find
@@ -5233,7 +5740,8 @@ static int clone_range(struct send_ctx *sctx,
if (hole_len > len)
hole_len = len;
- ret = send_extent_data(sctx, offset, hole_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ hole_len);
if (ret < 0)
goto out;
@@ -5306,14 +5814,16 @@ static int clone_range(struct send_ctx *sctx,
if (ret < 0)
goto out;
}
- ret = send_extent_data(sctx, offset + slen,
+ ret = send_extent_data(sctx, dst_path,
+ offset + slen,
clone_len - slen);
} else {
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
} else {
- ret = send_extent_data(sctx, offset, clone_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ clone_len);
}
if (ret < 0)
@@ -5324,13 +5834,28 @@ static int clone_range(struct send_ctx *sctx,
break;
offset += clone_len;
clone_root->offset += clone_len;
+
+ /*
+ * If we are cloning from the file we are currently processing,
+ * and using the send root as the clone root, we must stop once
+ * the current clone offset reaches the current eof of the file
+ * at the receiver, otherwise we would issue an invalid clone
+ * operation (source range going beyond eof) and cause the
+ * receiver to fail. So if we reach the current eof, bail out
+ * and fallback to a regular write.
+ */
+ if (clone_root->root == sctx->send_root &&
+ clone_root->ino == sctx->cur_ino &&
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
+ break;
+
data_offset += clone_len;
next:
path->slots[0]++;
}
if (len > 0)
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
out:
@@ -5344,51 +5869,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
-
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
- ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ ret = clone_range(sctx, path, clone_root, disk_byte,
+ data_offset, offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, path, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -5586,10 +6089,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
struct btrfs_path *path;
struct btrfs_root *root = sctx->send_root;
- struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 extent_end;
- u8 type;
int ret;
path = alloc_path_for_send();
@@ -5609,18 +6109,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
goto out;
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key.offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
btrfs_free_path(path);
return ret;
@@ -5674,16 +6163,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
break;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
-
- extent_end = ALIGN(key.offset + size,
- root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- }
+ extent_end = btrfs_file_extent_end(path);
if (extent_end <= start)
goto next;
if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
@@ -5704,9 +6184,6 @@ out:
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_file_extent_item *fi;
- u64 extent_end;
- u8 type;
int ret = 0;
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
@@ -5718,18 +6195,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
return ret;
}
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key->offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key->offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
-
if (path->slots[0] == 0 &&
sctx->cur_inode_last_extent < key->offset) {
/*
@@ -5755,7 +6220,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
else
ret = 0;
}
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
return ret;
}
@@ -5821,13 +6286,12 @@ out:
static int process_all_extents(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
root = sctx->send_root;
path = alloc_path_for_send();
@@ -5837,41 +6301,21 @@ static int process_all_extents(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = process_extent(sctx, path, &found_key);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -5902,14 +6346,18 @@ out:
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
int ret = 0;
+ struct btrfs_inode_info info;
u64 left_mode;
u64 left_uid;
u64 left_gid;
+ u64 left_fileattr;
u64 right_mode;
u64 right_uid;
u64 right_gid;
+ u64 right_fileattr;
int need_chmod = 0;
int need_chown = 0;
+ bool need_fileattr = false;
int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
@@ -5941,11 +6389,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
goto out;
-
- ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid, NULL);
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ left_mode = info.mode;
+ left_uid = info.uid;
+ left_gid = info.gid;
+ left_fileattr = info.fileattr;
if (!sctx->parent_root || sctx->cur_inode_new) {
need_chown = 1;
@@ -5956,16 +6406,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
} else {
u64 old_size;
- ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
- &old_size, NULL, &right_mode, &right_uid,
- &right_gid, NULL);
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ old_size = info.size;
+ right_mode = info.mode;
+ right_uid = info.uid;
+ right_gid = info.gid;
+ right_fileattr = info.fileattr;
if (left_uid != right_uid || left_gid != right_gid)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
+ need_fileattr = true;
if ((old_size == sctx->cur_inode_size) ||
(sctx->cur_inode_size > old_size &&
sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
@@ -6009,6 +6464,23 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret < 0)
goto out;
}
+ if (need_fileattr) {
+ ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_fileattr);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
+ && sctx->cur_inode_needs_verity) {
+ ret = process_verity(sctx);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = send_capabilities(sctx);
+ if (ret < 0)
+ goto out;
/*
* If other directory inodes depended on our current directory
@@ -6035,91 +6507,28 @@ out:
return ret;
}
-struct parent_paths_ctx {
- struct list_head *refs;
- struct send_ctx *sctx;
-};
-
-static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
- void *ctx)
-{
- struct parent_paths_ctx *ppctx = ctx;
-
- return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
- ppctx->refs);
-}
-
-/*
- * Issue unlink operations for all paths of the current inode found in the
- * parent snapshot.
- */
-static int btrfs_unlink_all_paths(struct send_ctx *sctx)
+static void close_current_inode(struct send_ctx *sctx)
{
- LIST_HEAD(deleted_refs);
- struct btrfs_path *path;
- struct btrfs_key key;
- struct parent_paths_ctx ctx;
- int ret;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- ctx.refs = &deleted_refs;
- ctx.sctx = sctx;
-
- while (true) {
- struct extent_buffer *eb = path->nodes[0];
- int slot = path->slots[0];
+ u64 i_size;
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->parent_root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.objectid != sctx->cur_ino)
- break;
- if (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)
- break;
-
- ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
- record_parent_ref, &ctx);
- if (ret < 0)
- goto out;
+ if (sctx->cur_inode == NULL)
+ return;
- path->slots[0]++;
- }
+ i_size = i_size_read(sctx->cur_inode);
- while (!list_empty(&deleted_refs)) {
- struct recorded_ref *ref;
+ /*
+ * If we are doing an incremental send, we may have extents between the
+ * last processed extent and the i_size that have not been processed
+ * because they haven't changed but we may have read some of their pages
+ * through readahead, see the comments at send_extent_data().
+ */
+ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ round_up(i_size, PAGE_SIZE) - 1);
- ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
- ret = send_unlink(sctx, ref->full_path);
- if (ret < 0)
- goto out;
- fs_path_free(ref->full_path);
- list_del(&ref->list);
- kfree(ref);
- }
- ret = 0;
-out:
- btrfs_free_path(path);
- if (ret)
- __free_recorded_refs(&deleted_refs);
- return ret;
+ iput(sctx->cur_inode);
+ sctx->cur_inode = NULL;
}
static int changed_inode(struct send_ctx *sctx,
@@ -6132,8 +6541,10 @@ static int changed_inode(struct send_ctx *sctx,
u64 left_gen = 0;
u64 right_gen = 0;
+ close_current_inode(sctx);
+
sctx->cur_ino = key->objectid;
- sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_new_gen = false;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
@@ -6174,7 +6585,7 @@ static int changed_inode(struct send_ctx *sctx,
*/
if (left_gen != right_gen &&
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
- sctx->cur_inode_new_gen = 1;
+ sctx->cur_inode_new_gen = true;
}
/*
@@ -6186,28 +6597,39 @@ static int changed_inode(struct send_ctx *sctx,
* file descriptor against it or turning a RO snapshot into RW mode,
* keep an open file descriptor against a file, delete it and then
* turn the snapshot back to RO mode before using it for a send
- * operation. So if we find such cases, ignore the inode and all its
- * items completely if it's a new inode, or if it's a changed inode
- * make sure all its previous paths (from the parent snapshot) are all
- * unlinked and all other the inode items are ignored.
+ * operation. The former is what the receiver operation does.
+ * Therefore, if we want to send these snapshots soon after they're
+ * received, we need to handle orphan inodes as well. Moreover, orphans
+ * can appear not only in the send snapshot but also in the parent
+ * snapshot. Here are several cases:
+ *
+ * Case 1: BTRFS_COMPARE_TREE_NEW
+ * | send snapshot | action
+ * --------------------------------
+ * nlink | 0 | ignore
+ *
+ * Case 2: BTRFS_COMPARE_TREE_DELETED
+ * | parent snapshot | action
+ * ----------------------------------
+ * nlink | 0 | as usual
+ * Note: No unlinks will be sent because there're no paths for it.
+ *
+ * Case 3: BTRFS_COMPARE_TREE_CHANGED
+ * | | parent snapshot | send snapshot | action
+ * -----------------------------------------------------------------------
+ * subcase 1 | nlink | 0 | 0 | ignore
+ * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
+ * subcase 3 | nlink | 0 | >0 | new_gen(creation)
+ *
*/
- if (result == BTRFS_COMPARE_TREE_NEW ||
- result == BTRFS_COMPARE_TREE_CHANGED) {
- u32 nlinks;
-
- nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
- if (nlinks == 0) {
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
sctx->ignore_cur_inode = true;
- if (result == BTRFS_COMPARE_TREE_CHANGED)
- ret = btrfs_unlink_all_paths(sctx);
goto out;
}
- }
-
- if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6218,13 +6640,23 @@ static int changed_inode(struct send_ctx *sctx,
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->right_path->nodes[0], right_ii);
} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 new_nlinks, old_nlinks;
+
+ new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
+ if (new_nlinks == 0 && old_nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ goto out;
+ } else if (new_nlinks == 0 || old_nlinks == 0) {
+ sctx->cur_inode_new_gen = 1;
+ }
/*
* We need to do some special handling in case the inode was
* reported as changed with a changed generation number. This
@@ -6236,58 +6668,66 @@ static int changed_inode(struct send_ctx *sctx,
/*
* First, process the inode as if it was deleted.
*/
- sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->right_path->nodes[0], right_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->right_path->nodes[0], right_ii);
- ret = process_all_refs(sctx,
- BTRFS_COMPARE_TREE_DELETED);
- if (ret < 0)
- goto out;
+ if (old_nlinks > 0) {
+ sctx->cur_inode_gen = right_gen;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->right_path->nodes[0], right_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->right_path->nodes[0], right_ii);
+ ret = process_all_refs(sctx,
+ BTRFS_COMPARE_TREE_DELETED);
+ if (ret < 0)
+ goto out;
+ }
/*
* Now process the inode as if it was new.
*/
- sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_rdev = btrfs_inode_rdev(
- sctx->left_path->nodes[0], left_ii);
- ret = send_create_inode_if_needed(sctx);
- if (ret < 0)
- goto out;
+ if (new_nlinks > 0) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0],
+ left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
- ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
- if (ret < 0)
- goto out;
- /*
- * Advance send_progress now as we did not get into
- * process_recorded_refs_if_needed in the new_gen case.
- */
- sctx->send_progress = sctx->cur_ino + 1;
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get
+ * into process_recorded_refs_if_needed in the
+ * new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
- /*
- * Now process all extents and xattrs of the inode as if
- * they were all new.
- */
- ret = process_all_extents(sctx);
- if (ret < 0)
- goto out;
- ret = process_all_new_xattrs(sctx);
- if (ret < 0)
- goto out;
+ /*
+ * Now process all extents and xattrs of the
+ * inode as if they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ }
} else {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_new_gen = 0;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_new_gen = false;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6379,7 +6819,7 @@ static int changed_extent(struct send_ctx *sctx,
* updates the inode item, but it only changes the iversion (sequence
* field in the inode item) of the inode, so if a file is deduplicated
* the same amount of times in both the parent and send snapshots, its
- * iversion becames the same in both snapshots, whence the inode item is
+ * iversion becomes the same in both snapshots, whence the inode item is
* the same on both snapshots.
*/
if (sctx->cur_ino != sctx->cmp_key->objectid)
@@ -6394,18 +6834,27 @@ static int changed_extent(struct send_ctx *sctx,
return ret;
}
+static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ sctx->cur_inode_needs_verity = true;
+ }
+ return ret;
+}
+
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
u64 orig_gen, new_gen;
int ret;
- ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &new_gen);
if (ret)
return ret;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
if (ret)
return ret;
@@ -6433,7 +6882,7 @@ static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
}
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *)(ptr +
@@ -6460,10 +6909,53 @@ static int changed_cb(struct btrfs_path *left_path,
struct btrfs_path *right_path,
struct btrfs_key *key,
enum btrfs_compare_tree_result result,
- void *ctx)
+ struct send_ctx *sctx)
{
int ret = 0;
- struct send_ctx *sctx = ctx;
+
+ /*
+ * We can not hold the commit root semaphore here. This is because in
+ * the case of sending and receiving to the same filesystem, using a
+ * pipe, could result in a deadlock:
+ *
+ * 1) The task running send blocks on the pipe because it's full;
+ *
+ * 2) The task running receive, which is the only consumer of the pipe,
+ * is waiting for a transaction commit (for example due to a space
+ * reservation when doing a write or triggering a transaction commit
+ * when creating a subvolume);
+ *
+ * 3) The transaction is waiting to write lock the commit root semaphore,
+ * but can not acquire it since it's being held at 1).
+ *
+ * Down this call chain we write to the pipe through kernel_write().
+ * The same type of problem can also happen when sending to a file that
+ * is stored in the same filesystem - when reserving space for a write
+ * into the file, we can trigger a transaction commit.
+ *
+ * Our caller has supplied us with clones of leaves from the send and
+ * parent roots, so we're safe here from a concurrent relocation and
+ * further reallocation of metadata extents while we are here. Below we
+ * also assert that the leaves are clones.
+ */
+ lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+ /*
+ * We always have a send root, so left_path is never NULL. We will not
+ * have a leaf when we have reached the end of the send root but have
+ * not yet reached the end of the parent root.
+ */
+ if (left_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &left_path->nodes[0]->bflags));
+ /*
+ * When doing a full send we don't have a parent root, so right_path is
+ * NULL. When doing an incremental send, we may have reached the end of
+ * the parent root already, so we don't have a leaf at right_path.
+ */
+ if (right_path && right_path->nodes[0])
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+ &right_path->nodes[0]->bflags));
if (result == BTRFS_COMPARE_TREE_SAME) {
if (key->type == BTRFS_INODE_REF_KEY ||
@@ -6505,29 +6997,69 @@ static int changed_cb(struct btrfs_path *left_path,
ret = changed_xattr(sctx, result);
else if (key->type == BTRFS_EXTENT_DATA_KEY)
ret = changed_extent(sctx, result);
+ else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
+ key->offset == 0)
+ ret = changed_verity(sctx, result);
}
out:
return ret;
}
+static int search_key_again(const struct send_ctx *sctx,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key)
+{
+ int ret;
+
+ if (!path->need_commit_sem)
+ lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+ /*
+ * Roots used for send operations are readonly and no one can add,
+ * update or remove keys from them, so we should be able to find our
+ * key again. The only exception is deduplication, which can operate on
+ * readonly roots and add, update or remove keys to/from them - but at
+ * the moment we don't allow it to run in parallel with send.
+ */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ ASSERT(ret <= 0);
+ if (ret > 0) {
+ btrfs_print_tree(path->nodes[path->lowest_level], false);
+ btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+ key->objectid, key->type, key->offset,
+ (root == sctx->parent_root ? "parent" : "send"),
+ root->root_key.objectid, path->lowest_level,
+ path->slots[path->lowest_level]);
+ return -EUCLEAN;
+ }
+
+ return ret;
+}
+
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
+ struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_path *path;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
+ path->reada = READA_FORWARD_ALWAYS;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
+ down_read(&fs_info->commit_root_sem);
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -6535,15 +7067,35 @@ static int full_send_tree(struct send_ctx *sctx)
goto out_finish;
while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &key, slot);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ret = changed_cb(path, NULL, &key,
BTRFS_COMPARE_TREE_NEW, sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ up_read(&fs_info->commit_root_sem);
+ /*
+ * A transaction used for relocating a block group was
+ * committed or is about to finish its commit. Release
+ * our path (leaf) and restart the search, so that we
+ * avoid operating on any file extent items that are
+ * stale, with a disk_bytenr that reflects a pre
+ * relocation value. This way we avoid as much as
+ * possible to fallback to regular writes when checking
+ * if we can clone file ranges.
+ */
+ btrfs_release_path(path);
+ ret = search_key_again(sctx, send_root, path, &key);
+ if (ret < 0)
+ goto out;
+ } else {
+ up_read(&fs_info->commit_root_sem);
+ }
+
ret = btrfs_next_item(send_root, path);
if (ret < 0)
goto out;
@@ -6561,18 +7113,58 @@ out:
return ret;
}
-static int tree_move_down(struct btrfs_path *path, int *level)
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+ struct extent_buffer *clone;
+
+ clone = btrfs_clone_extent_buffer(path->nodes[level]);
+ if (!clone)
+ return -ENOMEM;
+
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = clone;
+
+ return 0;
+}
+
+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
struct extent_buffer *eb;
+ struct extent_buffer *parent = path->nodes[*level];
+ int slot = path->slots[*level];
+ const int nritems = btrfs_header_nritems(parent);
+ u64 reada_max;
+ u64 reada_done = 0;
+
+ lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
BUG_ON(*level == 0);
- eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
+ eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
return PTR_ERR(eb);
+ /*
+ * Trigger readahead for the next leaves we will process, so that it is
+ * very likely that when we need them they are already in memory and we
+ * will not block on disk IO. For nodes we only do readahead for one,
+ * since the time window between processing nodes is typically larger.
+ */
+ reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
+
+ for (slot++; slot < nritems && reada_done < reada_max; slot++) {
+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
+ btrfs_readahead_node_child(parent, slot);
+ reada_done += eb->fs_info->nodesize;
+ }
+ }
+
path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
+
+ if (*level == 0)
+ return replace_node_with_clone(path, 0);
+
return 0;
}
@@ -6586,8 +7178,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
path->slots[*level]++;
while (path->slots[*level] >= nritems) {
- if (*level == root_level)
+ if (*level == root_level) {
+ path->slots[*level] = nritems - 1;
return -1;
+ }
/* move upnext */
path->slots[*level] = 0;
@@ -6609,23 +7203,30 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
static int tree_advance(struct btrfs_path *path,
int *level, int root_level,
int allow_down,
- struct btrfs_key *key)
+ struct btrfs_key *key,
+ u64 reada_min_gen)
{
int ret;
if (*level == 0 || !allow_down) {
ret = tree_move_next_or_upnext(path, level, root_level);
} else {
- ret = tree_move_down(path, level);
- }
- if (ret >= 0) {
- if (*level == 0)
- btrfs_item_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
- else
- btrfs_node_key_to_cpu(path->nodes[*level], key,
- path->slots[*level]);
+ ret = tree_move_down(path, level, reada_min_gen);
}
+
+ /*
+ * Even if we have reached the end of a tree, ret is -1, update the key
+ * anyway, so that in case we need to restart due to a block group
+ * relocation, we can assert that the last key of the root node still
+ * exists in the tree.
+ */
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+
return ret;
}
@@ -6637,8 +7238,8 @@ static int tree_compare_item(struct btrfs_path *left_path,
int len1, len2;
unsigned long off1, off2;
- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
if (len1 != len2)
return 1;
@@ -6655,6 +7256,97 @@ static int tree_compare_item(struct btrfs_path *left_path,
}
/*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ * any locks on them, which does not prevent them from having been relocated
+ * and reallocated after the last time we released the commit root semaphore.
+ * The exception are the root nodes, for which we always have a clone, see
+ * the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ * we are safe from the concurrent relocation and reallocation. However they
+ * can have file extent items with a pre relocation disk_bytenr value, so we
+ * restart the start from the current commit roots and clone the new leaves so
+ * that we get the post relocation disk_bytenr values. Not doing so, could
+ * make us clone the wrong data in case there are new extents using the old
+ * disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ const struct btrfs_key *left_key,
+ const struct btrfs_key *right_key,
+ int left_level,
+ int right_level,
+ const struct send_ctx *sctx)
+{
+ int root_level;
+ int ret;
+
+ lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+ btrfs_release_path(left_path);
+ btrfs_release_path(right_path);
+
+ /*
+ * Since keys can not be added or removed to/from our roots because they
+ * are readonly and we do not allow deduplication to run in parallel
+ * (which can add, remove or change keys), the layout of the trees should
+ * not change.
+ */
+ left_path->lowest_level = left_level;
+ ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+ if (ret < 0)
+ return ret;
+
+ right_path->lowest_level = right_level;
+ ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If the lowest level nodes are leaves, clone them so that they can be
+ * safely used by changed_cb() while not under the protection of the
+ * commit root semaphore, even if relocation and reallocation happens in
+ * parallel.
+ */
+ if (left_level == 0) {
+ ret = replace_node_with_clone(left_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (right_level == 0) {
+ ret = replace_node_with_clone(right_path, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * Now clone the root nodes (unless they happen to be the leaves we have
+ * already cloned). This is to protect against concurrent snapshotting of
+ * the send and parent roots (see the comment at btrfs_compare_trees()).
+ */
+ root_level = btrfs_header_level(sctx->send_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(left_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ root_level = btrfs_header_level(sctx->parent_root->commit_root);
+ if (root_level > 0) {
+ ret = replace_node_with_clone(right_path, root_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
* This function compares two trees and calls the provided callback for
* every changed/new/deleted item it finds.
* If shared tree blocks are encountered, whole subtrees are skipped, making
@@ -6668,8 +7360,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, struct send_ctx *sctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6683,14 +7374,15 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
int right_root_level;
int left_level;
int right_level;
- int left_end_reached;
- int right_end_reached;
- int advance_left;
- int advance_right;
+ int left_end_reached = 0;
+ int right_end_reached = 0;
+ int advance_left = 0;
+ int advance_right = 0;
u64 left_blockptr;
u64 right_blockptr;
u64 left_gen;
u64 right_gen;
+ u64 reada_min_gen;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -6753,12 +7445,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
down_read(&fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
+ /*
+ * We clone the root node of the send and parent roots to prevent races
+ * with snapshot creation of these roots. Snapshot creation COWs the
+ * root node of a tree, so after the transaction is committed the old
+ * extent can be reallocated while this send operation is still ongoing.
+ * So we clone them, under the commit root semaphore, to be race free.
+ */
left_path->nodes[left_level] =
btrfs_clone_extent_buffer(left_root->commit_root);
if (!left_path->nodes[left_level]) {
- up_read(&fs_info->commit_root_sem);
ret = -ENOMEM;
- goto out;
+ goto out_unlock;
}
right_level = btrfs_header_level(right_root->commit_root);
@@ -6766,11 +7464,17 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->nodes[right_level] =
btrfs_clone_extent_buffer(right_root->commit_root);
if (!right_path->nodes[right_level]) {
- up_read(&fs_info->commit_root_sem);
ret = -ENOMEM;
- goto out;
+ goto out_unlock;
}
- up_read(&fs_info->commit_root_sem);
+ /*
+ * Our right root is the parent root, while the left root is the "send"
+ * root. We know that all new nodes/leaves in the left root must have
+ * a generation greater than the right root's generation, so we trigger
+ * readahead for those nodes and leaves of the left root, as we know we
+ * will need to read them at some point.
+ */
+ reada_min_gen = btrfs_header_generation(right_root->commit_root);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -6785,78 +7489,94 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
btrfs_node_key_to_cpu(right_path->nodes[right_level],
&right_key, right_path->slots[right_level]);
- left_end_reached = right_end_reached = 0;
- advance_left = advance_right = 0;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
while (1) {
- cond_resched();
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
+ up_read(&fs_info->commit_root_sem);
+ cond_resched();
+ down_read(&fs_info->commit_root_sem);
+ }
+
+ if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+ ret = restart_after_relocation(left_path, right_path,
+ &left_key, &right_key,
+ left_level, right_level,
+ sctx);
+ if (ret < 0)
+ goto out_unlock;
+ sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ }
+
if (advance_left && !left_end_reached) {
ret = tree_advance(left_path, &left_level,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
- &left_key);
+ &left_key, reada_min_gen);
if (ret == -1)
left_end_reached = ADVANCE;
else if (ret < 0)
- goto out;
+ goto out_unlock;
advance_left = 0;
}
if (advance_right && !right_end_reached) {
ret = tree_advance(right_path, &right_level,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
- &right_key);
+ &right_key, reada_min_gen);
if (ret == -1)
right_end_reached = ADVANCE;
else if (ret < 0)
- goto out;
+ goto out_unlock;
advance_right = 0;
}
if (left_end_reached && right_end_reached) {
ret = 0;
- goto out;
+ goto out_unlock;
} else if (left_end_reached) {
if (right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
- ctx);
+ sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
}
advance_right = ADVANCE;
continue;
} else if (right_end_reached) {
if (left_level == 0) {
+ up_read(&fs_info->commit_root_sem);
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
- ctx);
+ sctx);
if (ret < 0)
goto out;
+ down_read(&fs_info->commit_root_sem);
}
advance_left = ADVANCE;
continue;
}
if (left_level == 0 && right_level == 0) {
+ up_read(&fs_info->commit_root_sem);
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
- ctx);
- if (ret < 0)
- goto out;
+ sctx);
advance_left = ADVANCE;
} else if (cmp > 0) {
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
- ctx);
- if (ret < 0)
- goto out;
+ sctx);
advance_right = ADVANCE;
} else {
enum btrfs_compare_tree_result result;
@@ -6869,12 +7589,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
else
result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_path, right_path,
- &left_key, result, ctx);
- if (ret < 0)
- goto out;
+ &left_key, result, sctx);
advance_left = ADVANCE;
advance_right = ADVANCE;
}
+
+ if (ret < 0)
+ goto out;
+ down_read(&fs_info->commit_root_sem);
} else if (left_level == right_level) {
cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
if (cmp < 0) {
@@ -6914,6 +7636,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
}
}
+out_unlock:
+ up_read(&fs_info->commit_root_sem);
out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
@@ -6936,8 +7660,7 @@ static int send_subvol(struct send_ctx *sctx)
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7014,7 +7737,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
int i;
if (root) {
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
@@ -7022,7 +7745,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
for (i = 0; i < sctx->clone_roots_cnt; i++) {
root = sctx->clone_roots[i].root;
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
@@ -7053,20 +7776,18 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
root->root_key.objectid, root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
+ struct btrfs_root *send_root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
- struct btrfs_key key;
struct send_ctx *sctx = NULL;
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
- int index;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -7104,13 +7825,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- if (!access_ok(arg->clone_sources,
- sizeof(*arg->clone_sources) *
- arg->clone_sources_count)) {
- ret = -EFAULT;
- goto out;
- }
-
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
ret = -EINVAL;
goto out;
@@ -7129,6 +7843,21 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+ if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
+
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
ret = -EBADF;
@@ -7147,15 +7876,32 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->clone_roots_cnt = arg->clone_sources_count;
- sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
- if (!sctx->send_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ if (sctx->proto >= 2) {
+ u32 send_buf_num_pages;
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
+ sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
+ sctx->send_buf_pages = kcalloc(send_buf_num_pages,
+ sizeof(*sctx->send_buf_pages),
+ GFP_KERNEL);
+ if (!sctx->send_buf_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < send_buf_num_pages; i++) {
+ sctx->send_buf_pages[i] =
+ vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
+ }
+ } else {
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ }
+ if (!sctx->send_buf) {
ret = -ENOMEM;
goto out;
}
@@ -7163,16 +7909,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
+ sctx->rbtree_new_refs = RB_ROOT;
+ sctx->rbtree_deleted_refs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7189,15 +7938,9 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
}
for (i = 0; i < arg->clone_sources_count; i++) {
- key.objectid = clone_sources_tmp[i];
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ clone_root = btrfs_get_fs_root(fs_info,
+ clone_sources_tmp[i], true);
if (IS_ERR(clone_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(clone_root);
goto out;
}
@@ -7205,20 +7948,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (!btrfs_root_readonly(clone_root) ||
btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EPERM;
goto out;
}
if (clone_root->dedupe_in_progress) {
dedupe_in_progress_warn(clone_root);
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EAGAIN;
goto out;
}
clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
@@ -7228,15 +7970,9 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
}
if (arg->parent_root) {
- key.objectid = arg->parent_root;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
+ true);
if (IS_ERR(sctx->parent_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(sctx->parent_root);
goto out;
}
@@ -7246,20 +7982,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (!btrfs_root_readonly(sctx->parent_root) ||
btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
if (sctx->parent_root->dedupe_in_progress) {
dedupe_in_progress_warn(sctx->parent_root);
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EAGAIN;
goto out;
}
spin_unlock(&sctx->parent_root->root_item_lock);
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
}
/*
@@ -7267,7 +7999,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
* is behind the current send position. This is checked while searching
* for possible clone sources.
*/
- sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+ sctx->clone_roots[sctx->clone_roots_cnt++].root =
+ btrfs_grab_root(sctx->send_root);
/* We do a bsearch later */
sort(sctx->clone_roots, sctx->clone_roots_cnt,
@@ -7283,23 +8016,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (ret)
goto out;
- mutex_lock(&fs_info->balance_mutex);
- if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
- btrfs_warn_rl(fs_info,
- "cannot run send because a balance operation is in progress");
- ret = -EAGAIN;
- goto out;
- }
- fs_info->send_in_progress++;
- mutex_unlock(&fs_info->balance_mutex);
-
- current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
- current->journal_info = NULL;
- mutex_lock(&fs_info->balance_mutex);
- fs_info->send_in_progress--;
- mutex_unlock(&fs_info->balance_mutex);
if (ret < 0)
goto out;
@@ -7352,18 +8069,24 @@ out:
}
if (sort_clone_roots) {
- for (i = 0; i < sctx->clone_roots_cnt; i++)
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
} else {
- for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
btrfs_root_dec_send_in_progress(send_root);
}
- if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
btrfs_root_dec_send_in_progress(sctx->parent_root);
+ btrfs_put_root(sctx->parent_root);
+ }
kvfree(clone_sources_tmp);
@@ -7372,11 +8095,14 @@ out:
fput(sctx->send_filp);
kvfree(sctx->clone_roots);
+ kfree(sctx->send_buf_pages);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
+ kvfree(sctx->verity_descriptor);
name_cache_free(sctx);
+ close_current_inode(sctx);
+
kfree(sctx);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f7034f..f7585cfa7e52 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -7,13 +7,24 @@
#ifndef BTRFS_SEND_H
#define BTRFS_SEND_H
-#include "ctree.h"
+#include <linux/types.h>
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
-#define BTRFS_SEND_STREAM_VERSION 1
+/* Conditional support for the upcoming protocol version. */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_SEND_STREAM_VERSION 3
+#else
+#define BTRFS_SEND_STREAM_VERSION 2
+#endif
+
+/*
+ * In send stream v1, no command is larger than 64K. In send stream v2, no limit
+ * should be assumed.
+ */
+#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
-#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
+struct inode;
+struct btrfs_ioctl_send_args;
enum btrfs_tlv_type {
BTRFS_TLV_U8,
@@ -47,80 +58,126 @@ struct btrfs_tlv_header {
/* commands */
enum btrfs_send_cmd {
- BTRFS_SEND_C_UNSPEC,
-
- BTRFS_SEND_C_SUBVOL,
- BTRFS_SEND_C_SNAPSHOT,
-
- BTRFS_SEND_C_MKFILE,
- BTRFS_SEND_C_MKDIR,
- BTRFS_SEND_C_MKNOD,
- BTRFS_SEND_C_MKFIFO,
- BTRFS_SEND_C_MKSOCK,
- BTRFS_SEND_C_SYMLINK,
-
- BTRFS_SEND_C_RENAME,
- BTRFS_SEND_C_LINK,
- BTRFS_SEND_C_UNLINK,
- BTRFS_SEND_C_RMDIR,
-
- BTRFS_SEND_C_SET_XATTR,
- BTRFS_SEND_C_REMOVE_XATTR,
-
- BTRFS_SEND_C_WRITE,
- BTRFS_SEND_C_CLONE,
-
- BTRFS_SEND_C_TRUNCATE,
- BTRFS_SEND_C_CHMOD,
- BTRFS_SEND_C_CHOWN,
- BTRFS_SEND_C_UTIMES,
-
- BTRFS_SEND_C_END,
- BTRFS_SEND_C_UPDATE_EXTENT,
- __BTRFS_SEND_C_MAX,
+ BTRFS_SEND_C_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_C_SUBVOL = 1,
+ BTRFS_SEND_C_SNAPSHOT = 2,
+
+ BTRFS_SEND_C_MKFILE = 3,
+ BTRFS_SEND_C_MKDIR = 4,
+ BTRFS_SEND_C_MKNOD = 5,
+ BTRFS_SEND_C_MKFIFO = 6,
+ BTRFS_SEND_C_MKSOCK = 7,
+ BTRFS_SEND_C_SYMLINK = 8,
+
+ BTRFS_SEND_C_RENAME = 9,
+ BTRFS_SEND_C_LINK = 10,
+ BTRFS_SEND_C_UNLINK = 11,
+ BTRFS_SEND_C_RMDIR = 12,
+
+ BTRFS_SEND_C_SET_XATTR = 13,
+ BTRFS_SEND_C_REMOVE_XATTR = 14,
+
+ BTRFS_SEND_C_WRITE = 15,
+ BTRFS_SEND_C_CLONE = 16,
+
+ BTRFS_SEND_C_TRUNCATE = 17,
+ BTRFS_SEND_C_CHMOD = 18,
+ BTRFS_SEND_C_CHOWN = 19,
+ BTRFS_SEND_C_UTIMES = 20,
+
+ BTRFS_SEND_C_END = 21,
+ BTRFS_SEND_C_UPDATE_EXTENT = 22,
+ BTRFS_SEND_C_MAX_V1 = 22,
+
+ /* Version 2 */
+ BTRFS_SEND_C_FALLOCATE = 23,
+ BTRFS_SEND_C_FILEATTR = 24,
+ BTRFS_SEND_C_ENCODED_WRITE = 25,
+ BTRFS_SEND_C_MAX_V2 = 25,
+
+ /* Version 3 */
+ BTRFS_SEND_C_ENABLE_VERITY = 26,
+ BTRFS_SEND_C_MAX_V3 = 26,
+ /* End */
+ BTRFS_SEND_C_MAX = 26,
};
-#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
/* attributes in send stream */
enum {
- BTRFS_SEND_A_UNSPEC,
-
- BTRFS_SEND_A_UUID,
- BTRFS_SEND_A_CTRANSID,
-
- BTRFS_SEND_A_INO,
- BTRFS_SEND_A_SIZE,
- BTRFS_SEND_A_MODE,
- BTRFS_SEND_A_UID,
- BTRFS_SEND_A_GID,
- BTRFS_SEND_A_RDEV,
- BTRFS_SEND_A_CTIME,
- BTRFS_SEND_A_MTIME,
- BTRFS_SEND_A_ATIME,
- BTRFS_SEND_A_OTIME,
-
- BTRFS_SEND_A_XATTR_NAME,
- BTRFS_SEND_A_XATTR_DATA,
-
- BTRFS_SEND_A_PATH,
- BTRFS_SEND_A_PATH_TO,
- BTRFS_SEND_A_PATH_LINK,
-
- BTRFS_SEND_A_FILE_OFFSET,
- BTRFS_SEND_A_DATA,
-
- BTRFS_SEND_A_CLONE_UUID,
- BTRFS_SEND_A_CLONE_CTRANSID,
- BTRFS_SEND_A_CLONE_PATH,
- BTRFS_SEND_A_CLONE_OFFSET,
- BTRFS_SEND_A_CLONE_LEN,
-
- __BTRFS_SEND_A_MAX,
+ BTRFS_SEND_A_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_A_UUID = 1,
+ BTRFS_SEND_A_CTRANSID = 2,
+
+ BTRFS_SEND_A_INO = 3,
+ BTRFS_SEND_A_SIZE = 4,
+ BTRFS_SEND_A_MODE = 5,
+ BTRFS_SEND_A_UID = 6,
+ BTRFS_SEND_A_GID = 7,
+ BTRFS_SEND_A_RDEV = 8,
+ BTRFS_SEND_A_CTIME = 9,
+ BTRFS_SEND_A_MTIME = 10,
+ BTRFS_SEND_A_ATIME = 11,
+ BTRFS_SEND_A_OTIME = 12,
+
+ BTRFS_SEND_A_XATTR_NAME = 13,
+ BTRFS_SEND_A_XATTR_DATA = 14,
+
+ BTRFS_SEND_A_PATH = 15,
+ BTRFS_SEND_A_PATH_TO = 16,
+ BTRFS_SEND_A_PATH_LINK = 17,
+
+ BTRFS_SEND_A_FILE_OFFSET = 18,
+ /*
+ * As of send stream v2, this attribute is special: it must be the last
+ * attribute in a command, its header contains only the type, and its
+ * length is implicitly the remaining length of the command.
+ */
+ BTRFS_SEND_A_DATA = 19,
+
+ BTRFS_SEND_A_CLONE_UUID = 20,
+ BTRFS_SEND_A_CLONE_CTRANSID = 21,
+ BTRFS_SEND_A_CLONE_PATH = 22,
+ BTRFS_SEND_A_CLONE_OFFSET = 23,
+ BTRFS_SEND_A_CLONE_LEN = 24,
+
+ BTRFS_SEND_A_MAX_V1 = 24,
+
+ /* Version 2 */
+ BTRFS_SEND_A_FALLOCATE_MODE = 25,
+
+ /*
+ * File attributes from the FS_*_FL namespace (i_flags, xflags),
+ * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored
+ * in btrfs_inode_item::flags (represented by btrfs_inode::flags and
+ * btrfs_inode::ro_flags).
+ */
+ BTRFS_SEND_A_FILEATTR = 26,
+
+ BTRFS_SEND_A_UNENCODED_FILE_LEN = 27,
+ BTRFS_SEND_A_UNENCODED_LEN = 28,
+ BTRFS_SEND_A_UNENCODED_OFFSET = 29,
+ /*
+ * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from
+ * BTRFS_SEND_C_ENCODED_WRITE.
+ */
+ BTRFS_SEND_A_COMPRESSION = 30,
+ BTRFS_SEND_A_ENCRYPTION = 31,
+ BTRFS_SEND_A_MAX_V2 = 31,
+
+ /* Version 3 */
+ BTRFS_SEND_A_VERITY_ALGORITHM = 32,
+ BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33,
+ BTRFS_SEND_A_VERITY_SALT_DATA = 34,
+ BTRFS_SEND_A_VERITY_SIG_DATA = 35,
+ BTRFS_SEND_A_MAX_V3 = 35,
+
+ __BTRFS_SEND_A_MAX = 35,
};
-#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
-#ifdef __KERNEL__
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
-#endif
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 01297c5b2666..f171bf875633 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -9,6 +9,155 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
+#include "zoned.h"
+
+/*
+ * HOW DOES SPACE RESERVATION WORK
+ *
+ * If you want to know about delalloc specifically, there is a separate comment
+ * for that with the delalloc code. This comment is about how the whole system
+ * works generally.
+ *
+ * BASIC CONCEPTS
+ *
+ * 1) space_info. This is the ultimate arbiter of how much space we can use.
+ * There's a description of the bytes_ fields with the struct declaration,
+ * refer to that for specifics on each field. Suffice it to say that for
+ * reservations we care about total_bytes - SUM(space_info->bytes_) when
+ * determining if there is space to make an allocation. There is a space_info
+ * for METADATA, SYSTEM, and DATA areas.
+ *
+ * 2) block_rsv's. These are basically buckets for every different type of
+ * metadata reservation we have. You can see the comment in the block_rsv
+ * code on the rules for each type, but generally block_rsv->reserved is how
+ * much space is accounted for in space_info->bytes_may_use.
+ *
+ * 3) btrfs_calc*_size. These are the worst case calculations we used based
+ * on the number of items we will want to modify. We have one for changing
+ * items, and one for inserting new items. Generally we use these helpers to
+ * determine the size of the block reserves, and then use the actual bytes
+ * values to adjust the space_info counters.
+ *
+ * MAKING RESERVATIONS, THE NORMAL CASE
+ *
+ * We call into either btrfs_reserve_data_bytes() or
+ * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
+ * num_bytes we want to reserve.
+ *
+ * ->reserve
+ * space_info->bytes_may_reserve += num_bytes
+ *
+ * ->extent allocation
+ * Call btrfs_add_reserved_bytes() which does
+ * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_reserved += extent_bytes
+ *
+ * ->insert reference
+ * Call btrfs_update_block_group() which does
+ * space_info->bytes_reserved -= extent_bytes
+ * space_info->bytes_used += extent_bytes
+ *
+ * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
+ *
+ * Assume we are unable to simply make the reservation because we do not have
+ * enough space
+ *
+ * -> __reserve_bytes
+ * create a reserve_ticket with ->bytes set to our reservation, add it to
+ * the tail of space_info->tickets, kick async flush thread
+ *
+ * ->handle_reserve_ticket
+ * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
+ * on the ticket.
+ *
+ * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
+ * Flushes various things attempting to free up space.
+ *
+ * -> btrfs_try_granting_tickets()
+ * This is called by anything that either subtracts space from
+ * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
+ * space_info->total_bytes. This loops through the ->priority_tickets and
+ * then the ->tickets list checking to see if the reservation can be
+ * completed. If it can the space is added to space_info->bytes_may_use and
+ * the ticket is woken up.
+ *
+ * -> ticket wakeup
+ * Check if ->bytes == 0, if it does we got our reservation and we can carry
+ * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
+ * were interrupted.)
+ *
+ * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
+ *
+ * Same as the above, except we add ourselves to the
+ * space_info->priority_tickets, and we do not use ticket->wait, we simply
+ * call flush_space() ourselves for the states that are safe for us to call
+ * without deadlocking and hope for the best.
+ *
+ * THE FLUSHING STATES
+ *
+ * Generally speaking we will have two cases for each state, a "nice" state
+ * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
+ * reduce the locking over head on the various trees, and even to keep from
+ * doing any work at all in the case of delayed refs. Each of these delayed
+ * things however hold reservations, and so letting them run allows us to
+ * reclaim space so we can make new reservations.
+ *
+ * FLUSH_DELAYED_ITEMS
+ * Every inode has a delayed item to update the inode. Take a simple write
+ * for example, we would update the inode item at write time to update the
+ * mtime, and then again at finish_ordered_io() time in order to update the
+ * isize or bytes. We keep these delayed items to coalesce these operations
+ * into a single operation done on demand. These are an easy way to reclaim
+ * metadata space.
+ *
+ * FLUSH_DELALLOC
+ * Look at the delalloc comment to get an idea of how much space is reserved
+ * for delayed allocation. We can reclaim some of this space simply by
+ * running delalloc, but usually we need to wait for ordered extents to
+ * reclaim the bulk of this space.
+ *
+ * FLUSH_DELAYED_REFS
+ * We have a block reserve for the outstanding delayed refs space, and every
+ * delayed ref operation holds a reservation. Running these is a quick way
+ * to reclaim space, but we want to hold this until the end because COW can
+ * churn a lot and we can avoid making some extent tree modifications if we
+ * are able to delay for as long as possible.
+ *
+ * ALLOC_CHUNK
+ * We will skip this the first time through space reservation, because of
+ * overcommit and we don't want to have a lot of useless metadata space when
+ * our worst case reservations will likely never come true.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we're freeing inodes we're likely freeing checksums, file extent
+ * items, and extent tree items. Loads of space could be freed up by these
+ * operations, however they won't be usable until the transaction commits.
+ *
+ * COMMIT_TRANS
+ * This will commit the transaction. Historically we had a lot of logic
+ * surrounding whether or not we'd commit the transaction, but this waits born
+ * out of a pre-tickets era where we could end up committing the transaction
+ * thousands of times in a row without making progress. Now thanks to our
+ * ticketing system we know if we're not making progress and can error
+ * everybody out after a few commits rather than burning the disk hoping for
+ * a different answer.
+ *
+ * OVERCOMMIT
+ *
+ * Because we hold so many reservations for metadata we will allow you to
+ * reserve more space than is currently free in the currently allocate
+ * metadata space. This only happens with metadata, data does not allow
+ * overcommitting.
+ *
+ * You can see the current logic for when we allow overcommit in
+ * btrfs_can_overcommit(), but it only applies to unallocated space. If there
+ * is no unallocated space to be had, all reservations are kept within the
+ * free space in the allocated metadata chunks.
+ *
+ * Because of overcommitting, you generally want to use the
+ * btrfs_can_overcommit() logic for metadata allocations, as it does the right
+ * thing with or without extra unallocated space.
+ */
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
@@ -16,6 +165,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
ASSERT(s_info);
return s_info->bytes_used + s_info->bytes_reserved +
s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
(may_use_included ? s_info->bytes_may_use : 0);
}
@@ -28,10 +178,45 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
+}
+
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
+ */
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size)
+{
+ WRITE_ONCE(space_info->chunk_size, chunk_size);
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -45,13 +230,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
- GFP_KERNEL);
- if (ret) {
- kfree(space_info);
- return ret;
- }
-
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -61,12 +239,17 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->ro_bgs);
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
+ space_info->clamp = 1;
+ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -110,29 +293,36 @@ out:
return ret;
}
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
- struct btrfs_space_info **space_info)
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
{
struct btrfs_space_info *found;
- int factor;
+ int factor, index;
- factor = btrfs_bg_type_to_factor(flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, flags);
+ found = btrfs_find_space_info(info, block_group->flags);
ASSERT(found);
spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- if (total_bytes > 0)
+ found->total_bytes += block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ found->active_total_bytes += block_group->length;
+ found->disk_total += block_group->length * factor;
+ found->bytes_used += block_group->used;
+ found->disk_used += block_group->used * factor;
+ found->bytes_readonly += block_group->bytes_super;
+ found->bytes_zone_unusable += block_group->zone_unusable;
+ if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
spin_unlock(&found->lock);
- *space_info = found;
+
+ block_group->space_info = found;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ down_write(&found->groups_sem);
+ list_add_tail(&block_group->list, &found->block_groups[index]);
+ up_write(&found->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -143,41 +333,26 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
- u64 used;
int factor;
- /* Don't overcommit when in mixed mode. */
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
- return 0;
-
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
- used = btrfs_space_info_used(space_info, true);
avail = atomic64_read(&fs_info->free_chunk_space);
/*
@@ -198,12 +373,57 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+ return avail;
+}
- if (used + bytes < space_info->total_bytes + avail)
+static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ /*
+ * On regular filesystem, all total_bytes are always writable. On zoned
+ * filesystem, there may be a limitation imposed by max_active_zones.
+ * For metadata allocation, we cannot finish an existing active block
+ * group to avoid a deadlock. Thus, we need to consider only the active
+ * groups to be writable for metadata space.
+ */
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return space_info->total_bytes;
+
+ return space_info->active_total_bytes;
+}
+
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 avail;
+ u64 used;
+
+ /* Don't overcommit when in mixed mode */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ used = btrfs_space_info_used(space_info, true);
+ if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+ avail = 0;
+ else
+ avail = calc_available_free_space(fs_info, space_info, flush);
+
+ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
return 1;
return 0;
}
+static void remove_ticket(struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ if (!list_empty(&ticket->list)) {
+ list_del_init(&ticket->list);
+ ASSERT(space_info->reclaim_size >= ticket->bytes);
+ space_info->reclaim_size -= ticket->bytes;
+ }
+}
+
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
@@ -224,14 +444,14 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
- /* Check and see if our ticket can be satisified now. */
- if ((used + ticket->bytes <= space_info->total_bytes) ||
+ /* Check and see if our ticket can be satisfied now. */
+ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@@ -256,27 +476,47 @@ do { \
spin_unlock(&__rsv->lock); \
} while (0)
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+ switch (space_info->flags) {
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "SYSTEM";
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "DATA+METADATA";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "DATA";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "METADATA";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info)
{
+ const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
- info->flags,
- info->total_bytes - btrfs_space_info_used(info, true),
+ /* The free space could be negative in case of overcommit */
+ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+ flag_str,
+ (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
- info->bytes_readonly);
-
- DUMP_BLOCK_RSV(fs_info, global_block_rsv);
- DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
- DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
+ info->bytes_readonly, info->bytes_zone_unusable);
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -288,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
spin_lock(&info->lock);
__btrfs_dump_space_info(fs_info, info);
+ dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
if (!dump_block_groups)
@@ -298,39 +539,18 @@ again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
btrfs_info(fs_info,
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
+ "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
cache->start, cache->length, cache->used, cache->pinned,
- cache->reserved, cache->ro ? "[readonly]" : "");
- btrfs_dump_free_space(cache, bytes);
+ cache->reserved, cache->zone_unusable,
+ cache->ro ? "[readonly]" : "");
spin_unlock(&cache->lock);
+ btrfs_dump_free_space(cache, bytes);
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -349,86 +569,100 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered,
+ bool for_preempt)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 dio_bytes;
- u64 async_pages;
+ u64 ordered_bytes;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
- /* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
-
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
- if (delalloc_bytes == 0 && dio_bytes == 0) {
- if (trans)
- return;
- if (wait_ordered)
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
return;
+
+ /* Calc the number of the pages we need flush for space reservation */
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly. What we really want to do is reclaim full inode's
+ * worth of reservations, however that's not available to us
+ * here. We will take a fraction of the delalloc bytes for our
+ * flushing loops and hope for the best. Delalloc will expand
+ * the amount we write to cover an entire dirty extent, which
+ * will reclaim the metadata reservation for that range. If
+ * it's not enough subsequent flush stages will be more
+ * aggressive.
+ */
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
+ trans = current->journal_info;
+
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
- if (dio_bytes > delalloc_bytes)
+ if (ordered_bytes > delalloc_bytes && !for_preempt)
wait_ordered = true;
loops = 0;
- while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ while ((delalloc_bytes || ordered_bytes) && loops < 3) {
+ u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ long nr_pages = min_t(u64, temp, LONG_MAX);
+ int async_pages;
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ btrfs_start_delalloc_roots(fs_info, nr_pages, true);
/*
- * We need to wait for the compressed pages to start before
- * we continue.
+ * We need to make sure any outstanding async pages are now
+ * processed before we continue. This is because things like
+ * sync_inode() try to be smart and skip writing if the inode is
+ * marked clean. We don't use filemap_fwrite for flushing
+ * because we want to control how many pages we write out at a
+ * time, thus this is the only safe way to make sure we've
+ * waited for outstanding compressed workers to have started
+ * their jobs and thus have ordered extents set up properly.
+ *
+ * This exists because we do not want to wait for each
+ * individual inode to finish its async work, we simply want to
+ * start the IO on everybody, and then come back here and wait
+ * for all of the async work to catch up. Once we're done with
+ * that we know we'll have ordered extents for everything and we
+ * can decide if we wait for that or not.
+ *
+ * If we choose to replace this in the future, make absolutely
+ * sure that the proper waiting is being done in the async case,
+ * as there have been bugs in that area before.
*/
async_pages = atomic_read(&fs_info->async_delalloc_pages);
if (!async_pages)
goto skip_async;
/*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
+ * We don't want to wait forever, if we wrote less pages in this
+ * loop than we have outstanding, only wait for that number of
+ * pages, otherwise we can wait for all async pages to finish
+ * before continuing.
*/
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
+ if (async_pages > nr_pages)
async_pages -= nr_pages;
-
+ else
+ async_pages = 0;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
+ async_pages);
skip_async:
- spin_lock(&space_info->lock);
- if (list_empty(&space_info->tickets) &&
- list_empty(&space_info->priority_tickets)) {
- spin_unlock(&space_info->lock);
- break;
- }
- spin_unlock(&space_info->lock);
-
loops++;
if (wait_ordered && !trans) {
btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
@@ -437,104 +671,28 @@ skip_async:
if (time_left)
break;
}
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
- }
-}
-
-/**
- * maybe_commit_transaction - possibly commit the transaction if its ok to
- * @root - the root we're allocating for
- * @bytes - the number of bytes we want to reserve
- * @force - force the commit
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does. Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
-{
- struct reserve_ticket *ticket = NULL;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_trans_handle *trans;
- u64 bytes_needed;
- u64 reclaim_bytes = 0;
- u64 cur_free_bytes = 0;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- if (trans)
- return -EAGAIN;
-
- spin_lock(&space_info->lock);
- cur_free_bytes = btrfs_space_info_used(space_info, true);
- if (cur_free_bytes < space_info->total_bytes)
- cur_free_bytes = space_info->total_bytes - cur_free_bytes;
- else
- cur_free_bytes = 0;
-
- if (!list_empty(&space_info->priority_tickets))
- ticket = list_first_entry(&space_info->priority_tickets,
- struct reserve_ticket, list);
- else if (!list_empty(&space_info->tickets))
- ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
-
- if (bytes_needed > cur_free_bytes)
- bytes_needed -= cur_free_bytes;
- else
- bytes_needed = 0;
- spin_unlock(&space_info->lock);
-
- if (!bytes_needed)
- return 0;
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ /*
+ * If we are for preemption we just want a one-shot of delalloc
+ * flushing so we can stop flushing if we decide we don't need
+ * to anymore.
+ */
+ if (for_preempt)
+ break;
- /*
- * See if there is enough pinned space to make this reservation, or if
- * we have block groups that are going to be freed, allowing us to
- * possibly do a chunk allocation the next loop through.
- */
- if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
- __percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
- goto commit;
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
+ spin_unlock(&space_info->lock);
- /*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
- */
- if (space_info != delayed_rsv->space_info)
- goto enospc;
-
- spin_lock(&delayed_rsv->lock);
- reclaim_bytes += delayed_rsv->reserved;
- spin_unlock(&delayed_rsv->lock);
-
- spin_lock(&delayed_refs_rsv->lock);
- reclaim_bytes += delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- if (reclaim_bytes >= bytes_needed)
- goto commit;
- bytes_needed -= reclaim_bytes;
-
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
- goto enospc;
-
-commit:
- return btrfs_commit_transaction(trans);
-enospc:
- btrfs_end_transaction(trans);
- return -ENOSPC;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(
+ &fs_info->ordered_bytes);
+ }
}
/*
@@ -544,9 +702,9 @@ enospc:
*/
static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
- int state)
+ enum btrfs_flush_state state, bool for_preempt)
{
- struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
@@ -569,8 +727,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
- state == FLUSH_DELALLOC_WAIT);
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
+ shrink_delalloc(fs_info, space_info, num_bytes,
+ state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -588,16 +749,45 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
+ /*
+ * For metadata space on zoned filesystem, reaching here means we
+ * don't have enough space left in active_total_bytes. Try to
+ * activate a block group first, because we may have inactive
+ * block group already allocated.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
+ if (ret < 0)
+ break;
+ else if (ret == 1)
+ break;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
+
+ /*
+ * For metadata space on zoned filesystem, allocating a new chunk
+ * is not enough. We still need to activate the block * group.
+ * Active the newly allocated block group by (maybe) finishing
+ * a block group.
+ */
+ if (ret == 1) {
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
+ /*
+ * Revert to the original ret regardless we could finish
+ * one block group or not.
+ */
+ if (ret >= 0)
+ ret = 1;
+ }
+
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
@@ -611,7 +801,13 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_wait_on_delayed_iputs(fs_info);
break;
case COMMIT_TRANS:
- ret = may_commit_transaction(fs_info, space_info);
+ ASSERT(current->journal_info == NULL);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_commit_transaction(trans);
break;
default:
ret = -ENOSPC;
@@ -619,7 +815,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
}
trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
- ret);
+ ret, for_preempt);
return;
}
@@ -627,57 +823,174 @@ static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- struct reserve_ticket *ticket;
u64 used;
- u64 expected;
- u64 to_reclaim = 0;
-
- list_for_each_entry(ticket, &space_info->tickets, list)
- to_reclaim += ticket->bytes;
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
- to_reclaim += ticket->bytes;
- if (to_reclaim)
- return to_reclaim;
-
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
- return 0;
+ u64 avail;
+ u64 total;
+ u64 to_reclaim = space_info->reclaim_size;
+
+ lockdep_assert_held(&space_info->lock);
+ avail = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
used = btrfs_space_info_used(space_info, true);
- if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
- BTRFS_RESERVE_FLUSH_ALL))
- expected = div_factor_fine(space_info->total_bytes, 95);
- else
- expected = div_factor_fine(space_info->total_bytes, 90);
+ /*
+ * We may be flushing because suddenly we have less space than we had
+ * before, and now we're well over-committed based on our current free
+ * space. If that's the case add in our overage so we make sure to put
+ * appropriate pressure on the flushing state machine.
+ */
+ total = writable_total_bytes(fs_info, space_info);
+ if (total + avail < used)
+ to_reclaim += used - (total + avail);
- if (used > expected)
- to_reclaim = used - expected;
- else
- to_reclaim = 0;
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
- space_info->bytes_reserved);
return to_reclaim;
}
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 used)
+static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
{
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+ u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ u64 ordered, delalloc;
+ u64 total = writable_total_bytes(fs_info, space_info);
+ u64 thresh;
+ u64 used;
+
+ thresh = div_factor_fine(total, 90);
+
+ lockdep_assert_held(&space_info->lock);
/* If we're just plain full then async reclaim just slows us down. */
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
- return 0;
+ if ((space_info->bytes_used + space_info->bytes_reserved +
+ global_rsv_size) >= thresh)
+ return false;
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
- return 0;
+ used = space_info->bytes_may_use + space_info->bytes_pinned;
+
+ /* The total flushable belongs to the global rsv, don't flush. */
+ if (global_rsv_size >= used)
+ return false;
+
+ /*
+ * 128MiB is 1/4 of the maximum global rsv size. If we have less than
+ * that devoted to other reservations then there's no sense in flushing,
+ * we don't have a lot of things that need flushing.
+ */
+ if (used - global_rsv_size <= SZ_128M)
+ return false;
+
+ /*
+ * We have tickets queued, bail so we don't compete with the async
+ * flushers.
+ */
+ if (space_info->reclaim_size)
+ return false;
+
+ /*
+ * If we have over half of the free space occupied by reservations or
+ * pinned then we want to start flushing.
+ *
+ * We do not do the traditional thing here, which is to say
+ *
+ * if (used >= ((total_bytes + avail) / 2))
+ * return 1;
+ *
+ * because this doesn't quite work how we want. If we had more than 50%
+ * of the space_info used by bytes_used and we had 0 available we'd just
+ * constantly run the background flusher. Instead we want it to kick in
+ * if our reclaimable space exceeds our clamped free space.
+ *
+ * Our clamping range is 2^1 -> 2^8. Practically speaking that means
+ * the following:
+ *
+ * Amount of RAM Minimum threshold Maximum threshold
+ *
+ * 256GiB 1GiB 128GiB
+ * 128GiB 512MiB 64GiB
+ * 64GiB 256MiB 32GiB
+ * 32GiB 128MiB 16GiB
+ * 16GiB 64MiB 8GiB
+ *
+ * These are the range our thresholds will fall in, corresponding to how
+ * much delalloc we need for the background flusher to kick in.
+ */
+
+ thresh = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_readonly + global_rsv_size;
+ if (used < total)
+ thresh += total - used;
+ thresh >>= space_info->clamp;
+
+ used = space_info->bytes_pinned;
+
+ /*
+ * If we have more ordered bytes than delalloc bytes then we're either
+ * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
+ * around. Preemptive flushing is only useful in that it can free up
+ * space before tickets need to wait for things to finish. In the case
+ * of ordered extents, preemptively waiting on ordered extents gets us
+ * nothing, if our reservations are tied up in ordered extents we'll
+ * simply have to slow down writers by forcing them to wait on ordered
+ * extents.
+ *
+ * In the case that ordered is larger than delalloc, only include the
+ * block reserves that we would actually be able to directly reclaim
+ * from. In this case if we're heavy on metadata operations this will
+ * clearly be heavy enough to warrant preemptive flushing. In the case
+ * of heavy DIO or ordered reservations, preemptive flushing will just
+ * waste time and cause us to slow down.
+ *
+ * We want to make sure we truly are maxed out on ordered however, so
+ * cut ordered in half, and if it's still higher than delalloc then we
+ * can keep flushing. This is to avoid the case where we start
+ * flushing, and now delalloc == ordered and we stop preemptively
+ * flushing when we could still have several gigs of delalloc to flush.
+ */
+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
+ if (ordered >= delalloc)
+ used += fs_info->delayed_refs_rsv.reserved +
+ fs_info->delayed_block_rsv.reserved;
+ else
+ used += space_info->bytes_may_use - global_rsv_size;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
+static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (!ticket->steal)
+ return false;
+
+ if (global_rsv->space_info != space_info)
+ return false;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, 1);
+ if (global_rsv->reserved < min_bytes + ticket->bytes) {
+ spin_unlock(&global_rsv->lock);
+ return false;
+ }
+ global_rsv->reserved -= ticket->bytes;
+ remove_ticket(space_info, ticket);
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ space_info->tickets_id++;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ return true;
+}
+
/*
* maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
* @fs_info - fs_info for this fs
@@ -698,7 +1011,9 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
- u64 first_ticket_bytes = 0;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
+
+ trace_btrfs_fail_all_tickets(fs_info, space_info);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
@@ -710,27 +1025,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- /*
- * may_commit_transaction will avoid committing the transaction
- * if it doesn't feel like the space reclaimed by the commit
- * would result in the ticket succeeding. However if we have a
- * smaller ticket in the queue it may be small enough to be
- * satisified by committing the transaction, so if any
- * subsequent ticket is smaller than the first ticket go ahead
- * and send us back for another loop through the enospc flushing
- * code.
- */
- if (first_ticket_bytes == 0)
- first_ticket_bytes = ticket->bytes;
- else if (first_ticket_bytes > ticket->bytes)
+ if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
- list_del_init(&ticket->list);
- ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
@@ -739,7 +1045,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
* here to see if we can make progress with the next ticket in
* the list.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
@@ -754,7 +1061,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
- int flush_state;
+ enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
@@ -773,7 +1080,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim, flush_state, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -792,6 +1099,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
+ /*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
* freed up a bunch of space and so have a lot of pinned space
@@ -821,9 +1136,229 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * This handles pre-flushing of metadata space before we get to the point that
+ * we need to start blocking threads on tickets. The logic here is different
+ * from the other flush paths because it doesn't rely on tickets to tell us how
+ * much we need to flush, instead it attempts to keep us below the 80% full
+ * watermark of space by flushing whichever reservation pool is currently the
+ * largest.
+ */
+static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv;
+ struct btrfs_block_rsv *trans_rsv;
+ int loops = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info,
+ preempt_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ delayed_block_rsv = &fs_info->delayed_block_rsv;
+ delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ global_rsv = &fs_info->global_block_rsv;
+ trans_rsv = &fs_info->trans_block_rsv;
+
+ spin_lock(&space_info->lock);
+ while (need_preemptive_reclaim(fs_info, space_info)) {
+ enum btrfs_flush_state flush;
+ u64 delalloc_size = 0;
+ u64 to_reclaim, block_rsv_size;
+ u64 global_rsv_size = global_rsv->reserved;
+
+ loops++;
+
+ /*
+ * We don't have a precise counter for the metadata being
+ * reserved for delalloc, so we'll approximate it by subtracting
+ * out the block rsv's space from the bytes_may_use. If that
+ * amount is higher than the individual reserves, then we can
+ * assume it's tied up in delalloc reservations.
+ */
+ block_rsv_size = global_rsv_size +
+ delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved +
+ trans_rsv->reserved;
+ if (block_rsv_size < space_info->bytes_may_use)
+ delalloc_size = space_info->bytes_may_use - block_rsv_size;
+
+ /*
+ * We don't want to include the global_rsv in our calculation,
+ * because that's space we can't touch. Subtract it from the
+ * block_rsv_size for the next checks.
+ */
+ block_rsv_size -= global_rsv_size;
+
+ /*
+ * We really want to avoid flushing delalloc too much, as it
+ * could result in poor allocation patterns, so only flush it if
+ * it's larger than the rest of the pools combined.
+ */
+ if (delalloc_size > block_rsv_size) {
+ to_reclaim = delalloc_size;
+ flush = FLUSH_DELALLOC;
+ } else if (space_info->bytes_pinned >
+ (delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved)) {
+ to_reclaim = space_info->bytes_pinned;
+ flush = COMMIT_TRANS;
+ } else if (delayed_block_rsv->reserved >
+ delayed_refs_rsv->reserved) {
+ to_reclaim = delayed_block_rsv->reserved;
+ flush = FLUSH_DELAYED_ITEMS_NR;
+ } else {
+ to_reclaim = delayed_refs_rsv->reserved;
+ flush = FLUSH_DELAYED_REFS_NR;
+ }
+
+ spin_unlock(&space_info->lock);
+
+ /*
+ * We don't want to reclaim everything, just a portion, so scale
+ * down the to_reclaim by 1/4. If it takes us down to 0,
+ * reclaim 1 items worth.
+ */
+ to_reclaim >>= 2;
+ if (!to_reclaim)
+ to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
+ flush_space(fs_info, space_info, to_reclaim, flush, true);
+ cond_resched();
+ spin_lock(&space_info->lock);
+ }
+
+ /* We only went through once, back off our clamping. */
+ if (loops == 1 && !space_info->reclaim_size)
+ space_info->clamp = max(1, space_info->clamp - 1);
+ trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by running the
+ * iputs
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_FULL,
+ RUN_DELAYED_IPUTS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ enum btrfs_flush_state flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state], false);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
+ }
+ spin_unlock(&space_info->lock);
+ }
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
+ INIT_WORK(&fs_info->preempt_reclaim_work,
+ btrfs_preempt_reclaim_metadata_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -839,6 +1374,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
@@ -850,27 +1386,74 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int states_nr)
{
u64 to_reclaim;
- int flush_state;
+ int flush_state = 0;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
- if (!to_reclaim) {
+ /*
+ * This is the priority reclaim path, so to_reclaim could be >0 still
+ * because we may have only satisfied the priority tickets and still
+ * left non priority tickets on the list. We would then have
+ * to_reclaim but ->bytes == 0.
+ */
+ if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
- spin_unlock(&space_info->lock);
- flush_state = 0;
- do {
- flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
+ while (flush_state < states_nr) {
+ spin_unlock(&space_info->lock);
+ flush_space(fs_info, space_info, to_reclaim, states[flush_state],
+ false);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
+ }
+
+ /* Attempt to steal from the global rsv if we can. */
+ if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ }
+
+ /*
+ * We must run try_granting_tickets here because we could be a large
+ * ticket in front of a smaller ticket that can now be satisfied with
+ * the available space.
+ */
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ spin_lock(&space_info->lock);
+
+ /* We could have been granted before we got here. */
+ if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
- } while (flush_state < states_nr);
+ return;
+ }
+
+ while (!space_info->full) {
+ spin_unlock(&space_info->lock);
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ }
+
+ ticket->error = -ENOSPC;
+ remove_ticket(space_info, ticket);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
}
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
@@ -893,7 +1476,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
* despite getting an error, resulting in a space leak
* (bytes_may_use counter of our space_info).
*/
- list_del_init(&ticket->list);
+ remove_ticket(space_info, ticket);
ticket->error = -EINTR;
break;
}
@@ -908,11 +1491,14 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
}
/**
- * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
- * @fs_info - the fs
- * @space_info - the space_info for the reservation
- * @ticket - the ticket for the reservation
- * @flush - how much we can flush
+ * Do the appropriate flushing and waiting for a ticket
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info for the reservation
+ * @ticket: ticket for the reservation
+ * @start_ns: timestamp when the reservation started
+ * @orig_bytes: amount of bytes originally reserved
+ * @flush: how much we can flush
*
* This does the work of figuring out how to flush for the ticket, waiting for
* the reservation, and returning the appropriate error if there is one.
@@ -920,12 +1506,15 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
+ u64 start_ns, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
+ case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
@@ -938,24 +1527,15 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
}
- spin_lock(&space_info->lock);
ret = ticket->error;
- if (ticket->bytes || ticket->error) {
- /*
- * Need to delete here for priority tickets. For regular tickets
- * either the async reclaim job deletes the ticket from the list
- * or we delete it ourselves at wait_reserve_ticket().
- */
- list_del_init(&ticket->list);
- if (!ret)
- ret = -ENOSPC;
- }
- spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
/*
* Check that we can't have an error set if the reservation succeeded,
@@ -964,15 +1544,52 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* space wasn't reserved at all).
*/
ASSERT(!(ticket->bytes == 0 && ticket->error));
+ trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
+ start_ns, flush, ticket->error);
return ret;
}
+/*
+ * This returns true if this flush state will go through the ordinary flushing
+ * code.
+ */
+static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
+ (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+}
+
+static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+
+ /*
+ * If we're heavy on ordered operations then clamping won't help us. We
+ * need to clamp specifically to keep up with dirty'ing buffered
+ * writers, because there's not a 1:1 correlation of writing delalloc
+ * and freeing space, like there is with flushing delayed refs or
+ * delayed nodes. If we're already more ordered than delalloc then
+ * we're keeping up, otherwise we aren't and should probably clamp.
+ */
+ if (ordered < delalloc)
+ space_info->clamp = min(space_info->clamp + 1, 8);
+}
+
+static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
+{
+ return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_EVICT);
+}
+
/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * Try to reserve bytes from the block_rsv's space
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info we want to allocate from
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -981,12 +1598,13 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
+ u64 start_ns = 0;
u64 used;
int ret = 0;
bool pending_tickets;
@@ -994,18 +1612,32 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
- pending_tickets = !list_empty(&space_info->tickets) ||
- !list_empty(&space_info->priority_tickets);
+
+ /*
+ * We don't want NO_FLUSH allocations to jump everybody, they can
+ * generally handle ENOSPC in a different way, so treat them the same as
+ * normal flushers when it comes to skipping pending tickets.
+ */
+ if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
+ pending_tickets = !list_empty(&space_info->tickets) ||
+ !list_empty(&space_info->priority_tickets);
+ else
+ pending_tickets = !list_empty(&space_info->priority_tickets);
/*
* Carry on if we have enough space (short-circuit) OR call
* can_overcommit() to ensure we can overcommit to continue.
*/
if (!pending_tickets &&
- ((used + orig_bytes <= space_info->total_bytes) ||
+ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
@@ -1022,51 +1654,67 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
ticket.bytes = orig_bytes;
ticket.error = 0;
+ space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
- if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+ ticket.steal = can_steal(flush);
+ if (trace_btrfs_reserve_ticket_enabled())
+ start_ns = ktime_get_ns();
+
+ if (flush == BTRFS_RESERVE_FLUSH_ALL ||
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
+ /*
+ * We were forced to add a reserve ticket, so
+ * our preemptive flushing is unable to keep
+ * up. Clamp down on the threshold for the
+ * preemptive flushing in order to keep up with
+ * the workload.
+ */
+ maybe_clamp_preempt(fs_info, space_info);
+
space_info->flush = 1;
trace_btrfs_trigger_flush(fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
&space_info->priority_tickets);
}
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
/*
* We will do the space reservation dance during log replay,
* which means we won't have fs_info->fs_root set, so don't do
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(fs_info, space_info, used) &&
- !work_busy(&fs_info->async_reclaim_work)) {
+ !work_busy(&fs_info->preempt_reclaim_work) &&
+ need_preemptive_reclaim(fs_info, space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ &fs_info->preempt_reclaim_work);
}
}
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
return ret;
- return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
+ return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
+ orig_bytes, flush);
}
/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * Trye to reserve metadata bytes from the block_rsv's space
+ *
+ * @fs_info: the filesystem
+ * @block_rsv: block_rsv we're allocating for
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -1075,23 +1723,14 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
- if (ret == -ENOSPC &&
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
- if (block_rsv != global_rsv &&
- !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
- ret = 0;
- }
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
@@ -1103,3 +1742,48 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
return ret;
}
+
+/**
+ * Try to reserve data bytes for an allocation
+ *
+ * @fs_info: the filesystem
+ * @bytes: number of bytes we need
+ * @flush: how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_NO_FLUSH);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ btrfs_info(fs_info, "dumping space info:");
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ spin_lock(&space_info->lock);
+ __btrfs_dump_space_info(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+ dump_global_block_rsv(fs_info);
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 24514cd2c6c1..ce66023a9eb8 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include "volumes.h"
+
struct btrfs_space_info {
spinlock_t lock;
@@ -17,10 +19,26 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ /* Total bytes in the space, but only accounts active block groups. */
+ u64 active_total_bytes;
+ u64 bytes_zone_unusable; /* total bytes that are unusable until
+ resetting the device zone */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
+ /* Chunk size in bytes */
+ u64 chunk_size;
+
+ /*
+ * Once a block group drops below this threshold (percents) we'll
+ * schedule it for reclaim.
+ */
+ int bg_reclaim_threshold;
+
+ int clamp; /* Used to scale our threshold for preemptive
+ flushing. The value is >> clamp, so turns
+ out to be a 2^clamp divisor. */
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
@@ -37,23 +55,18 @@ struct btrfs_space_info {
u64 flags;
- /*
- * bytes_pinned is kept in line with what is actually pinned, as in
- * we've called update_block_group and dropped the bytes_used counter
- * and increased the bytes_pinned counter. However this means that
- * bytes_pinned does not reflect the bytes that will be pinned once the
- * delayed refs are flushed, so this counter is inc'ed every time we
- * call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zeroed every
- * time the transaction commits.
- */
- struct percpu_counter total_bytes_pinned;
-
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
+
+ /*
+ * Size of space that needs to be reclaimed in order to satisfy pending
+ * tickets
+ */
+ u64 reclaim_size;
+
/*
* tickets_id just indicates the next ticket will be handled, so note
* it's not stored per ticket.
@@ -71,6 +84,7 @@ struct btrfs_space_info {
struct reserve_ticket {
u64 bytes;
int error;
+ bool steal;
struct list_head list;
wait_queue_head_t wait;
};
@@ -109,10 +123,10 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
- struct btrfs_space_info **space_info);
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group);
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
@@ -121,7 +135,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
+int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
@@ -141,5 +155,9 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
+void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 73f7987143df..12455b2b41de 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,161 +7,149 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
+static bool check_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
{
- return *(u8 *)p;
-}
+ const unsigned long member_offset = (unsigned long)ptr + off;
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
+ if (unlikely(member_offset + size > eb->len)) {
+ btrfs_warn(eb->fs_info,
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
+ (unsigned long)ptr, eb->start, member_offset, size);
+ return false;
+ }
+
+ return true;
}
/*
- * this is some deeply nasty code.
+ * Macro templates that define helpers to read/write extent buffer data of a
+ * given size, that are also used via ctree.h for access to item members by
+ * specialized helpers.
*
- * The end result is that anyone who #includes ctree.h gets a
- * declaration for the btrfs_set_foo functions and btrfs_foo functions,
- * which are wrappers of btrfs_set_token_#bits functions and
- * btrfs_get_token_#bits functions, which are defined in this file.
+ * Generic helpers:
+ * - btrfs_set_8 (for 8/16/32/64)
+ * - btrfs_get_8 (for 8/16/32/64)
*
- * These setget functions do all the extent_buffer related mapping
- * required to efficiently read and write specific fields in the extent
- * buffers. Every pointer to metadata items in btrfs is really just
- * an unsigned long offset into the extent buffer which has been
- * cast to a specific type. This gives us all the gcc type checking.
+ * Generic helpers with a token (cached address of the most recently accessed
+ * page):
+ * - btrfs_set_token_8 (for 8/16/32/64)
+ * - btrfs_get_token_8 (for 8/16/32/64)
*
- * The extent buffer api is used to do the page spanning work required to
- * have a metadata blocksize different from the page size.
+ * The set/get functions handle data spanning two pages transparently, in case
+ * metadata block size is larger than page. Every pointer to metadata items is
+ * an offset into the extent buffer page array, cast to a specific type. This
+ * gives us all the type checking.
*
- * There are 2 variants defined, one with a token pointer and one without.
+ * The extent buffer pages stored in the array pages do not form a contiguous
+ * phyusical range, but the API functions assume the linear offset to the range
+ * from 0 to metadata node size.
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \
- const void *ptr, unsigned long off, \
- struct btrfs_map_token *token) \
+u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
+ const void *ptr, unsigned long off) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
- u##bits res; \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
+ const int size = sizeof(u##bits); \
+ u8 lebytes[sizeof(u##bits)]; \
+ const int part = PAGE_SIZE - oip; \
\
ASSERT(token); \
- ASSERT(token->eb == eb); \
- \
- if (token->kaddr && token->offset <= offset && \
- (token->offset + PAGE_SIZE >= offset + size)) { \
- kaddr = token->kaddr; \
- p = kaddr + part_offset - token->offset; \
- res = get_unaligned_le##bits(p + off); \
- return res; \
+ ASSERT(token->kaddr); \
+ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
+ if (token->offset <= member_offset && \
+ member_offset + size <= token->offset + PAGE_SIZE) { \
+ return get_unaligned_le##bits(token->kaddr + oip); \
} \
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits leres; \
+ token->kaddr = page_address(token->eb->pages[idx]); \
+ token->offset = idx << PAGE_SHIFT; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
+ return get_unaligned_le##bits(token->kaddr + oip); \
\
- read_extent_buffer(eb, &leres, offset, size); \
- return le##bits##_to_cpu(leres); \
- } \
- p = kaddr + part_offset - map_start; \
- res = get_unaligned_le##bits(p + off); \
- token->kaddr = kaddr; \
- token->offset = map_start; \
- return res; \
+ memcpy(lebytes, token->kaddr + oip, part); \
+ token->kaddr = page_address(token->eb->pages[idx + 1]); \
+ token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes + part, token->kaddr, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
- u##bits res; \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ char *kaddr = page_address(eb->pages[idx]); \
+ const int size = sizeof(u##bits); \
+ const int part = PAGE_SIZE - oip; \
+ u8 lebytes[sizeof(u##bits)]; \
\
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits leres; \
+ ASSERT(check_setget_bounds(eb, ptr, off, size)); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
+ return get_unaligned_le##bits(kaddr + oip); \
\
- read_extent_buffer(eb, &leres, offset, size); \
- return le##bits##_to_cpu(leres); \
- } \
- p = kaddr + part_offset - map_start; \
- res = get_unaligned_le##bits(p + off); \
- return res; \
+ memcpy(lebytes, kaddr + oip, part); \
+ kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes + part, kaddr, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
-void btrfs_set_token_##bits(struct extent_buffer *eb, \
+void btrfs_set_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off, \
- u##bits val, \
- struct btrfs_map_token *token) \
+ u##bits val) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
+ const int size = sizeof(u##bits); \
+ u8 lebytes[sizeof(u##bits)]; \
+ const int part = PAGE_SIZE - oip; \
\
ASSERT(token); \
- ASSERT(token->eb == eb); \
- \
- if (token->kaddr && token->offset <= offset && \
- (token->offset + PAGE_SIZE >= offset + size)) { \
- kaddr = token->kaddr; \
- p = kaddr + part_offset - token->offset; \
- put_unaligned_le##bits(val, p + off); \
+ ASSERT(token->kaddr); \
+ ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
+ if (token->offset <= member_offset && \
+ member_offset + size <= token->offset + PAGE_SIZE) { \
+ put_unaligned_le##bits(val, token->kaddr + oip); \
return; \
} \
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits val2; \
- \
- val2 = cpu_to_le##bits(val); \
- write_extent_buffer(eb, &val2, offset, size); \
+ token->kaddr = page_address(token->eb->pages[idx]); \
+ token->offset = idx << PAGE_SHIFT; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
+ put_unaligned_le##bits(val, token->kaddr + oip); \
return; \
} \
- p = kaddr + part_offset - map_start; \
- put_unaligned_le##bits(val, p + off); \
- token->kaddr = kaddr; \
- token->offset = map_start; \
+ put_unaligned_le##bits(val, lebytes); \
+ memcpy(token->kaddr + oip, lebytes, part); \
+ token->kaddr = page_address(token->eb->pages[idx + 1]); \
+ token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr, lebytes + part, size - part); \
} \
-void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
- unsigned long part_offset = (unsigned long)ptr; \
- unsigned long offset = part_offset + off; \
- void *p; \
- int err; \
- char *kaddr; \
- unsigned long map_start; \
- unsigned long map_len; \
- int size = sizeof(u##bits); \
+ const unsigned long member_offset = (unsigned long)ptr + off; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ char *kaddr = page_address(eb->pages[idx]); \
+ const int size = sizeof(u##bits); \
+ const int part = PAGE_SIZE - oip; \
+ u8 lebytes[sizeof(u##bits)]; \
\
- err = map_private_extent_buffer(eb, offset, size, \
- &kaddr, &map_start, &map_len); \
- if (err) { \
- __le##bits val2; \
- \
- val2 = cpu_to_le##bits(val); \
- write_extent_buffer(eb, &val2, offset, size); \
+ ASSERT(check_setget_bounds(eb, ptr, off, size)); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
+ put_unaligned_le##bits(val, kaddr + oip); \
return; \
} \
- p = kaddr + part_offset - map_start; \
- put_unaligned_le##bits(val, p + off); \
+ \
+ put_unaligned_le##bits(val, lebytes); \
+ memcpy(kaddr + oip, lebytes, part); \
+ kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr, lebytes + part, size - part); \
}
DEFINE_BTRFS_SETGET_BITS(8)
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
new file mode 100644
index 000000000000..9a176af847d7
--- /dev/null
+++ b/fs/btrfs/subpage.c
@@ -0,0 +1,768 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include "ctree.h"
+#include "subpage.h"
+#include "btrfs_inode.h"
+
+/*
+ * Subpage (sectorsize < PAGE_SIZE) support overview:
+ *
+ * Limitations:
+ *
+ * - Only support 64K page size for now
+ * This is to make metadata handling easier, as 64K page would ensure
+ * all nodesize would fit inside one page, thus we don't need to handle
+ * cases where a tree block crosses several pages.
+ *
+ * - Only metadata read-write for now
+ * The data read-write part is in development.
+ *
+ * - Metadata can't cross 64K page boundary
+ * btrfs-progs and kernel have done that for a while, thus only ancient
+ * filesystems could have such problem. For such case, do a graceful
+ * rejection.
+ *
+ * Special behavior:
+ *
+ * - Metadata
+ * Metadata read is fully supported.
+ * Meaning when reading one tree block will only trigger the read for the
+ * needed range, other unrelated range in the same page will not be touched.
+ *
+ * Metadata write support is partial.
+ * The writeback is still for the full page, but we will only submit
+ * the dirty extent buffers in the page.
+ *
+ * This means, if we have a metadata page like this:
+ *
+ * Page offset
+ * 0 16K 32K 48K 64K
+ * |/////////| |///////////|
+ * \- Tree block A \- Tree block B
+ *
+ * Even if we just want to writeback tree block A, we will also writeback
+ * tree block B if it's also dirty.
+ *
+ * This may cause extra metadata writeback which results more COW.
+ *
+ * Implementation:
+ *
+ * - Common
+ * Both metadata and data will use a new structure, btrfs_subpage, to
+ * record the status of each sector inside a page. This provides the extra
+ * granularity needed.
+ *
+ * - Metadata
+ * Since we have multiple tree blocks inside one page, we can't rely on page
+ * locking anymore, or we will have greatly reduced concurrency or even
+ * deadlocks (hold one tree lock while trying to lock another tree lock in
+ * the same page).
+ *
+ * Thus for metadata locking, subpage support relies on io_tree locking only.
+ * This means a slightly higher tree locking latency.
+ */
+
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+{
+ if (fs_info->sectorsize >= PAGE_SIZE)
+ return false;
+
+ /*
+ * Only data pages (either through DIO or compression) can have no
+ * mapping. And if page->mapping->host is data inode, it's subpage.
+ * As we have ruled our sectorsize >= PAGE_SIZE case already.
+ */
+ if (!page->mapping || !page->mapping->host ||
+ is_data_inode(page->mapping->host))
+ return true;
+
+ /*
+ * Now the only remaining case is metadata, which we only go subpage
+ * routine if nodesize < PAGE_SIZE.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return true;
+ return false;
+}
+
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type)
+{
+ struct btrfs_subpage *subpage;
+
+ /*
+ * We have cases like a dummy extent buffer page, which is not mapped
+ * and doesn't need to be locked.
+ */
+ if (page->mapping)
+ ASSERT(PageLocked(page));
+
+ /* Either not subpage, or the page already has private attached */
+ if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ return 0;
+
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
+ attach_page_private(page, subpage);
+ return 0;
+}
+
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ /* Either not subpage, or already detached */
+ if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ return;
+
+ subpage = detach_page_private(page);
+ ASSERT(subpage);
+ btrfs_free_subpage(subpage);
+}
+
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
+{
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&ret->lock);
+ if (type == BTRFS_SUBPAGE_METADATA) {
+ atomic_set(&ret->eb_refs, 0);
+ } else {
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
+ }
+ return ret;
+}
+
+void btrfs_free_subpage(struct btrfs_subpage *subpage)
+{
+ kfree(subpage);
+}
+
+/*
+ * Increase the eb_refs of current subpage.
+ *
+ * This is important for eb allocation, to prevent race with last eb freeing
+ * of the same page.
+ * With the eb_refs increased before the eb inserted into radix tree,
+ * detach_extent_buffer_page() won't detach the page private while we're still
+ * allocating the extent buffer.
+ */
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ atomic_inc(&subpage->eb_refs);
+}
+
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(atomic_read(&subpage->eb_refs));
+ atomic_dec(&subpage->eb_refs);
+}
+
+static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ /* Basic checks */
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
+ /*
+ * The range check only works for mapped page, we can still have
+ * unmapped page like dummy extent buffer pages.
+ */
+ if (page->mapping)
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+}
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = len >> fs_info->sectorsize_bits;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ atomic_add(nbits, &subpage->readers);
+}
+
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = len >> fs_info->sectorsize_bits;
+ bool is_data;
+ bool last;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+ is_data = is_data_inode(page->mapping->host);
+ ASSERT(atomic_read(&subpage->readers) >= nbits);
+ last = atomic_sub_and_test(nbits, &subpage->readers);
+
+ /*
+ * For data we need to unlock the page if the last read has finished.
+ *
+ * And please don't replace @last with atomic_sub_and_test() call
+ * inside if () condition.
+ * As we want the atomic_sub_and_test() to be always executed.
+ */
+ if (is_data && last)
+ unlock_page(page);
+}
+
+static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+{
+ u64 orig_start = *start;
+ u32 orig_len = *len;
+
+ *start = max_t(u64, page_offset(page), orig_start);
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+ * beyond the target range. In that case, just set @len to 0, subpage
+ * helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
+}
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+ int ret;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ASSERT(atomic_read(&subpage->readers) == 0);
+ ret = atomic_add_return(nbits, &subpage->writers);
+ ASSERT(ret == nbits);
+}
+
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ /*
+ * We have call sites passing @lock_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
+ ASSERT(atomic_read(&subpage->writers) >= nbits);
+ return atomic_sub_and_test(nbits, &subpage->writers);
+}
+
+/*
+ * Lock a page for delalloc page writeback.
+ *
+ * Return -EAGAIN if the page is not properly initialized.
+ * Return 0 with the page locked, and writer counter updated.
+ *
+ * Even with 0 returned, the page still need extra check to make sure
+ * it's really the correct page, as the caller is using
+ * filemap_get_folios_contig(), which can race with page invalidating.
+ */
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
+ lock_page(page);
+ return 0;
+ }
+ lock_page(page);
+ if (!PagePrivate(page) || !page->private) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ btrfs_subpage_clamp_range(page, &start, &len);
+ btrfs_subpage_start_writer(fs_info, page, start, len);
+ return 0;
+}
+
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
+ return unlock_page(page);
+ btrfs_subpage_clamp_range(page, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
+ unlock_page(page);
+}
+
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_zero;
+
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
+
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
+}
+
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
+ SetPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ SetPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
+ ClearPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ set_page_dirty(page);
+}
+
+/*
+ * Extra clear_and_test function for subpage dirty bitmap.
+ *
+ * Return true if we're the last bits in the dirty_bitmap and clear the
+ * dirty_bitmap.
+ * Return false otherwise.
+ *
+ * NOTE: Callers should manually clear page dirty for true case, as we have
+ * extra handling for tree blocks.
+ */
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
+ unsigned long flags;
+ bool last = false;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
+ last = true;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
+}
+
+void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ bool last;
+
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ if (last)
+ clear_page_dirty_for_io(page);
+}
+
+void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ set_page_writeback(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
+ ASSERT(PageWriteback(page));
+ end_page_writeback(page);
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ SetPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
+ ClearPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Unlike set/clear which is dependent on each page status, for test all bits
+ * are tested in the same way.
+ */
+#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
+ unsigned long flags; \
+ bool ret; \
+ \
+ spin_lock_irqsave(&subpage->lock, flags); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
+ spin_unlock_irqrestore(&subpage->lock, flags); \
+ return ret; \
+}
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
+
+/*
+ * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
+ * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
+ * back to regular sectorsize branch.
+ */
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
+ test_page_func) \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
+ return test_page_func(page); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
+ return test_page_func(page); \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
+ PageUptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
+ PageDirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
+ PageWriteback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
+ PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+
+/*
+ * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
+ * is cleared.
+ */
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ASSERT(!PageDirty(page));
+ if (!btrfs_is_subpage(fs_info, page))
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ * It should not have any subpage::writers count.
+ * Can be unlocked by unlock_page().
+ * This is the most common locked page for __extent_writepage() called
+ * inside extent_write_cache_pages().
+ * Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ * There is only one caller, all pages except @locked_page for
+ * extent_write_locked_range().
+ * In this case, we have to call subpage helper to handle the case.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For non-subpage case, we just unlock the page */
+ if (!btrfs_is_subpage(fs_info, page))
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For subpage case, there are two types of locked page. With or
+ * without writers number.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
new file mode 100644
index 000000000000..0e80ad336904
--- /dev/null
+++ b/fs/btrfs/subpage.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUBPAGE_H
+#define BTRFS_SUBPAGE_H
+
+#include <linux/spinlock.h>
+
+/*
+ * Extra info for subpapge bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset /- error_offset /- dirty_offset
+ * | | |
+ * v v v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
+ */
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_start indicates where the bitmap starts, the length is always
+ * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
+
+/*
+ * Structure to trace status of each sector inside a page, attached to
+ * page::private for both data and metadata inodes.
+ */
+struct btrfs_subpage {
+ /* Common members for both data and metadata pages */
+ spinlock_t lock;
+ /*
+ * Both data and metadata needs to track how many readers are for the
+ * page.
+ * Data relies on @readers to unlock the page when last reader finished.
+ * While metadata doesn't need page unlock, it needs to prevent
+ * page::private get cleared before the last end_page_read().
+ */
+ atomic_t readers;
+ union {
+ /*
+ * Structures only used by metadata
+ *
+ * @eb_refs should only be operated under private_lock, as it
+ * manages whether the subpage can be detached.
+ */
+ atomic_t eb_refs;
+
+ /* Structures only used by data */
+ atomic_t writers;
+ };
+ unsigned long bitmaps[];
+};
+
+enum btrfs_subpage_type {
+ BTRFS_SUBPAGE_METADATA,
+ BTRFS_SUBPAGE_DATA,
+};
+
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+/* Allocate additional data where page represents more than one sector */
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
+void btrfs_free_subpage(struct btrfs_subpage *subpage);
+
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+/*
+ * Template for subpage related operations.
+ *
+ * btrfs_subpage_*() are for call sites where the page has subpage attached and
+ * the range is ensured to be inside the page.
+ *
+ * btrfs_page_*() are for call sites where the page can either be subpage
+ * specific or regular page. The function will handle both cases.
+ * But the range still needs to be inside the page.
+ *
+ * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * need to be inside the page. Those functions will truncate the range
+ * automatically.
+ */
+#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
+void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len);
+
+DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
+DECLARE_BTRFS_SUBPAGE_OPS(error);
+DECLARE_BTRFS_SUBPAGE_OPS(dirty);
+DECLARE_BTRFS_SUBPAGE_OPS(writeback);
+DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
+
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
+
+#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67c63858812a..5942b9384088 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -23,7 +23,6 @@
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include <linux/cleancache.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
@@ -44,11 +43,12 @@
#include "backref.h"
#include "space-info.h"
#include "sysfs.h"
+#include "zoned.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
-
#include "qgroup.h"
+#include "raid56.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -67,28 +67,98 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem state.
+ * RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+ unsigned int bit;
+ bool states_printed = false;
+ unsigned long fs_state = READ_ONCE(info->fs_state);
+ char *curr = buf;
+
+ memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+ curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+ for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
+ WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+ if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+ *curr++ = fs_state_chars[bit];
+ states_printed = true;
+ }
+ }
+
+ /* If no states were printed, reset the buffer */
+ if (!states_printed)
+ curr = buf;
+
+ *curr++ = 0;
+}
+#endif
+
+/*
+ * Generally the error codes correspond to their respective errors, but there
+ * are a few special cases.
+ *
+ * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
+ * instance will return EUCLEAN if any of the blocks are corrupted in
+ * a way that is problematic. We want to reserve EUCLEAN for these
+ * sort of corruptions.
+ *
+ * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
+ * need to use EROFS for this case. We will have no idea of the
+ * original failure, that will have been reported at the time we tripped
+ * over the error. Each subsequent error that doesn't have any context
+ * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */
const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
switch (errno) {
- case -EIO:
+ case -ENOENT: /* -2 */
+ errstr = "No such entry";
+ break;
+ case -EIO: /* -5 */
errstr = "IO failure";
break;
- case -ENOMEM:
+ case -ENOMEM: /* -12*/
errstr = "Out of memory";
break;
- case -EROFS:
- errstr = "Readonly filesystem";
- break;
- case -EEXIST:
+ case -EEXIST: /* -17 */
errstr = "Object already exists";
break;
- case -ENOSPC:
+ case -ENOSPC: /* -28 */
errstr = "No space left";
break;
- case -ENOENT:
- errstr = "No such entry";
+ case -EROFS: /* -30 */
+ errstr = "Readonly filesystem";
+ break;
+ case -EOPNOTSUPP: /* -95 */
+ errstr = "Operation not supported";
+ break;
+ case -EUCLEAN: /* -117 */
+ errstr = "Filesystem corrupted";
+ break;
+ case -EDQUOT: /* -122 */
+ errstr = "Quota exceeded";
break;
}
@@ -105,6 +175,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
const char *errstr;
#endif
@@ -117,6 +188,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
va_list args;
@@ -125,12 +197,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.fmt = fmt;
vaf.va = &args;
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, function, line, errno, errstr, &vaf);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
va_end(args);
} else {
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
- sb->s_id, function, line, errno, errstr);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
}
#endif
@@ -150,7 +222,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
btrfs_discard_stop(fs_info);
/* btrfs handle error by forcing the filesystem readonly */
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not canceled here
@@ -190,7 +262,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -216,14 +288,48 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
vaf.fmt = fmt;
vaf.va = &args;
- if (__ratelimit(ratelimit))
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
- fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);
+ if (__ratelimit(ratelimit)) {
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
+ }
va_end(args);
}
#endif
+#if BITS_PER_LONG == 32
+void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
+ btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
+ btrfs_warn(fs_info,
+"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_warn(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+
+void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
+ btrfs_err(fs_info, "reached 32bit limit for logical addresses");
+ btrfs_err(fs_info,
+"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_err(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+#endif
+
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
@@ -240,23 +346,14 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno)
+ unsigned int line, int errno, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- trans->aborted = errno;
- /* Nothing used. The other threads that have joined this
- * transaction may be able to continue. */
- if (!trans->dirty && list_empty(&trans->new_bgs)) {
- const char *errstr;
-
- errstr = btrfs_decode_error(errno);
- btrfs_warn(fs_info,
- "%s:%d: Aborting unused transaction(%s).",
- function, line, errstr);
- return;
- }
+ WRITE_ONCE(trans->aborted, errno);
WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
@@ -309,7 +406,6 @@ enum {
Opt_device,
Opt_fatal_errors,
Opt_flushoncommit, Opt_noflushoncommit,
- Opt_inode_cache, Opt_noinode_cache,
Opt_max_inline,
Opt_barrier, Opt_nobarrier,
Opt_datacow, Opt_nodatacow,
@@ -317,7 +413,6 @@ enum {
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
Opt_discard_mode,
- Opt_nologreplay,
Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
@@ -331,13 +426,19 @@ enum {
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
- Opt_usebackuproot,
Opt_user_subvol_rm_allowed,
+ /* Rescue options */
+ Opt_rescue,
+ Opt_usebackuproot,
+ Opt_nologreplay,
+ Opt_ignorebadroots,
+ Opt_ignoredatacsums,
+ Opt_rescue_all,
+
/* Deprecated options */
- Opt_alloc_start,
Opt_recovery,
- Opt_subvolrootid,
+ Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
Opt_check_integrity,
@@ -381,7 +482,6 @@ static const match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
- {Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
@@ -399,13 +499,17 @@ static const match_table_t tokens = {
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
{Opt_notreelog, "notreelog"},
- {Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+ /* Rescue options */
+ {Opt_rescue, "rescue=%s"},
+ /* Deprecated, with alias rescue=nologreplay */
+ {Opt_nologreplay, "nologreplay"},
+ /* Deprecated, with alias rescue=usebackuproot */
+ {Opt_usebackuproot, "usebackuproot"},
+
/* Deprecated options */
- {Opt_alloc_start, "alloc_start=%s"},
{Opt_recovery, "recovery"},
- {Opt_subvolrootid, "subvolrootid=%d"},
/* Debugging options */
{Opt_check_integrity, "check_int"},
@@ -424,6 +528,88 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+static const match_table_t rescue_tokens = {
+ {Opt_usebackuproot, "usebackuproot"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_ignorebadroots, "ignorebadroots"},
+ {Opt_ignorebadroots, "ibadroots"},
+ {Opt_ignoredatacsums, "ignoredatacsums"},
+ {Opt_ignoredatacsums, "idatacsums"},
+ {Opt_rescue_all, "all"},
+ {Opt_err, NULL},
+};
+
+static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
+ const char *opt_name)
+{
+ if (fs_info->mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
+
+static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+{
+ char *opts;
+ char *orig;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int ret = 0;
+
+ opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+ orig = opts;
+
+ while ((p = strsep(&opts, ":")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+ token = match_token(p, rescue_tokens, args);
+ switch (token){
+ case Opt_usebackuproot:
+ btrfs_info(info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ break;
+ case Opt_nologreplay:
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_ignorebadroots:
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ break;
+ case Opt_ignoredatacsums:
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ break;
+ case Opt_rescue_all:
+ btrfs_info(info, "enabling all of the rescue options");
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
+ case Opt_err:
+ btrfs_info(info, "unrecognized rescue option '%s'", p);
+ ret = -EINVAL;
+ goto out;
+ default:
+ break;
+ }
+
+ }
+out:
+ kfree(orig);
+ return ret;
+}
+
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
@@ -434,20 +620,27 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
{
substring_t args[MAX_OPT_ARGS];
char *p, *num;
- u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
+ int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
+ const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
- cache_gen = btrfs_super_cache_generation(info->super_copy);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (cache_gen)
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ else if (btrfs_free_space_cache_v1_active(info)) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_info(info,
+ "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(info->super_copy, 0);
+ } else {
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ }
+ }
/*
* Even the options are empty, we still need to do extra check
@@ -470,7 +663,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_subvol:
case Opt_subvol_empty:
case Opt_subvolid:
- case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_subvol_options or
@@ -514,7 +706,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
- /* Fallthrough */
+ fallthrough;
case Opt_compress:
case Opt_compress_type:
saved_compress_type = btrfs_test_opt(info,
@@ -522,6 +714,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
+ saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
@@ -547,6 +740,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -566,11 +760,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
no_compress++;
} else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -586,11 +784,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(info, COMPRESS) &&
- (info->compress_type != saved_compress_type ||
- compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(info, COMPRESS) &&
- no_compress == 1)) {
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
@@ -613,7 +811,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
- /* Fallthrough */
+ fallthrough;
case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
@@ -629,8 +827,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
goto out;
} else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
ret = -EINVAL;
goto out;
}
@@ -654,10 +855,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
}
break;
- case Opt_alloc_start:
- btrfs_info(info,
- "option alloc_start is obsolete, ignored");
- break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
info->sb->s_flags |= SB_POSIXACL;
@@ -680,6 +877,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_norecovery:
case Opt_nologreplay:
+ btrfs_warn(info,
+ "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
@@ -693,8 +892,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
goto out;
+ }
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
@@ -711,6 +913,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -723,6 +927,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_space_cache:
case Opt_space_cache_version:
+ /*
+ * We already set FREE_SPACE_TREE above because we have
+ * compat_ro(FREE_SPACE_TREE) set, and we aren't going
+ * to allow v1 to be set for extent tree v2, simply
+ * ignore this setting if we're extent tree v2.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (token == Opt_space_cache ||
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(info->mount_opt,
@@ -735,6 +947,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -743,6 +957,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
+ /*
+ * We cannot operate without the free space tree with
+ * extent tree v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (btrfs_test_opt(info, SPACE_CACHE)) {
btrfs_clear_and_info(info, SPACE_CACHE,
"disabling disk space caching");
@@ -753,14 +973,17 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_inode_cache:
- btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
- "enabling inode map caching");
- break;
case Opt_noinode_cache:
- btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
- "disabling inode map caching");
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
+ /*
+ * We cannot clear the free space tree with extent tree
+ * v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
@@ -782,10 +1005,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
"disabling auto defrag");
break;
case Opt_recovery:
- btrfs_warn(info,
- "'recovery' is deprecated, use 'usebackuproot' instead");
- /* fall through */
case Opt_usebackuproot:
+ btrfs_warn(info,
+ "'%s' is deprecated, use 'rescue=usebackuproot' instead",
+ token == Opt_recovery ? "recovery" :
+ "usebackuproot");
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
@@ -797,8 +1021,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_check_integrity_including_extent_data:
btrfs_info(info,
"enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt,
- CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
@@ -807,8 +1030,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
goto out;
+ }
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
@@ -823,13 +1050,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
#endif
case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0)
+ if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else if (strcmp(args[0].from, "bug") == 0)
+ } else if (strcmp(args[0].from, "bug") == 0) {
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else {
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -837,8 +1066,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
goto out;
+ }
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
@@ -850,6 +1083,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
info->commit_interval = intarg;
break;
+ case Opt_rescue:
+ ret = parse_rescue_options(info, args[0].from);
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
+ goto out;
+ }
+ break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -873,7 +1114,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
#endif
case Opt_err:
- btrfs_info(info, "unrecognized mount option '%s'", p);
+ btrfs_err(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@@ -881,14 +1122,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
}
check:
- /*
- * Extra check for current option against current flag
- */
- if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
- btrfs_err(info,
- "nologreplay must be used with ro mount option");
+ /* We're read-only, don't have to check. */
+ if (new_flags & SB_RDONLY)
+ goto out;
+
+ if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
ret = -EINVAL;
- }
out:
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, FREE_SPACE_TREE) &&
@@ -897,10 +1138,14 @@ out:
ret = -EINVAL;
}
- if (!ret && btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
+ if (!ret)
+ ret = btrfs_check_mountopts_zoned(info);
+ if (!ret && !remounting) {
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ btrfs_info(info, "using free space tree");
+ }
return ret;
}
@@ -1011,9 +1256,6 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
*subvol_objectid = subvolid;
break;
- case Opt_subvolrootid:
- pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
- break;
default:
break;
}
@@ -1024,11 +1266,11 @@ out:
return error;
}
-static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
- u64 subvol_objectid)
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *fs_root;
+ struct btrfs_root *fs_root = NULL;
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
@@ -1043,7 +1285,6 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto err;
}
- path->leave_spinning = 1;
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
@@ -1062,21 +1303,14 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
- ret = btrfs_previous_item(root, path, subvol_objectid,
- BTRFS_ROOT_BACKREF_KEY);
- if (ret < 0) {
- goto err;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto err;
- }
+ ret = -ENOENT;
+ goto err;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
subvol_objectid = key.offset;
root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1093,12 +1327,10 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
btrfs_release_path(path);
- key.objectid = subvol_objectid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
+ fs_root = NULL;
goto err;
}
@@ -1111,21 +1343,14 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(fs_root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
- ret = btrfs_previous_item(fs_root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0) {
- goto err;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto err;
- }
+ ret = -ENOENT;
+ goto err;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
dirid = key.offset;
inode_ref = btrfs_item_ptr(path->nodes[0],
@@ -1143,6 +1368,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
ptr[0] = '/';
btrfs_release_path(path);
}
+ btrfs_put_root(fs_root);
+ fs_root = NULL;
}
btrfs_free_path(path);
@@ -1155,6 +1382,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
return name;
err:
+ btrfs_put_root(fs_root);
btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
@@ -1171,7 +1399,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
/*
* Find the "default" dir item which points to the root item that we
@@ -1207,7 +1434,6 @@ static int btrfs_fill_super(struct super_block *sb,
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_key key;
int err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1215,6 +1441,9 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_op = &btrfs_super_ops;
sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &btrfs_verityops;
+#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -1235,10 +1464,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(sb, &key, fs_info->fs_root);
+ inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -1250,7 +1476,6 @@ static int btrfs_fill_super(struct super_block *sb,
goto fail_close;
}
- cleancache_init_fs(sb);
sb->s_flags |= SB_ACTIVE;
return 0;
@@ -1302,10 +1527,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return btrfs_commit_transaction(trans);
}
+static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
+{
+ seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
+ *printed = true;
+}
+
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
+ bool printed = false;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1338,7 +1571,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
- seq_puts(seq, ",nologreplay");
+ print_rescue_option(seq, "nologreplay", &printed);
+ if (btrfs_test_opt(info, USEBACKUPROOT))
+ print_rescue_option(seq, "usebackuproot", &printed);
+ if (btrfs_test_opt(info, IGNOREBADROOTS))
+ print_rescue_option(seq, "ignorebadroots", &printed);
+ if (btrfs_test_opt(info, IGNOREDATACSUMS))
+ print_rescue_option(seq, "ignoredatacsums", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1347,9 +1586,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
- if (btrfs_test_opt(info, SPACE_CACHE))
+ if (btrfs_free_space_cache_v1_active(info))
seq_puts(seq, ",space_cache");
- else if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
@@ -1363,12 +1602,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",enospc_debug");
if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
- if (btrfs_test_opt(info, INODE_MAP_CACHE))
- seq_puts(seq, ",inode_cache");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
seq_puts(seq, ",check_int_data");
else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
@@ -1392,8 +1629,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
@@ -1438,8 +1680,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
goto out;
}
}
- subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
- subvol_objectid);
+ subvol_name = btrfs_get_subvol_name_from_objectid(
+ btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
@@ -1518,14 +1760,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
- * it for searching for existing supers, so this lets us do that and
- * then open_ctree will properly initialize everything later.
+ * then open_ctree will properly initialize the file system specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
*/
fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
}
+ btrfs_init_fs_info(fs_info);
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
@@ -1561,7 +1806,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_close_devices;
}
- bdev = fs_devices->latest_bdev;
+ bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
@@ -1571,11 +1816,13 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
if (s->s_root) {
btrfs_close_devices(fs_devices);
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
@@ -1594,7 +1841,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
@@ -1689,23 +1936,12 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
- new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
- new_pool_size);
-}
-
-static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
@@ -1725,6 +1961,8 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+
/*
* We need to cleanup all defragable inodes if the autodefragment is
* close or the filesystem is read only.
@@ -1742,13 +1980,14 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
- clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ /* If we toggled space cache */
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *root = fs_info->tree_root;
unsigned old_flags = sb->s_flags;
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
@@ -1758,7 +1997,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
int ret;
sync_filesystem(sb);
- btrfs_remount_prepare(fs_info);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
if (data) {
void *new_sec_opts = NULL;
@@ -1775,10 +2014,30 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0)
+ goto restore;
+
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from ro to rw");
+ /* Make sure free space cache options match the state on disk */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ }
+
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
@@ -1788,6 +2047,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
@@ -1796,7 +2056,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
/* avoid complains from lockdep et al. */
up(&fs_info->uuid_tree_rescan_sem);
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
/*
* Setting SB_RDONLY will put the cleaner thread to
@@ -1807,15 +2067,47 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
*/
btrfs_delete_unused_bgs(fs_info);
+ /*
+ * The cleaner task could be already running before we set the
+ * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
+ * We must make sure that after we finish the remount, i.e. after
+ * we call btrfs_commit_super(), the cleaner can no longer start
+ * a transaction - either because it was dropping a dead root,
+ * running delayed iputs or deleting an unused block group (the
+ * cleaner picked a block group from the list of unused block
+ * groups before we were able to in the previous call to
+ * btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
+ TASK_UNINTERRUPTIBLE);
+
+ /*
+ * We've set the superblock to RO mode, so we might have made
+ * the cleaner task sleep without running all pending delayed
+ * iputs. Go through all the delayed iputs here, so that if an
+ * unmount happens without remounting RW we don't end up at
+ * finishing close_ctree() with a non-empty list of delayed
+ * iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want
+ * it to be still running after we are in RO mode, as after that,
+ * by the time we unmount, it might have left a transaction open,
+ * so we would leak the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
ret = btrfs_commit_super(fs_info);
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
@@ -1840,52 +2132,39 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto restore;
-
- /* recover relocation */
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret)
- goto restore;
-
- ret = btrfs_resume_balance_async(fs_info);
+ /*
+ * NOTE: when remounting with a change that does writes, don't
+ * put it anywhere above this point, as we are not sure to be
+ * safe to write until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
goto restore;
- ret = btrfs_resume_dev_replace_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume dev_replace");
- goto restore;
- }
-
- btrfs_qgroup_rescan_resume(fs_info);
-
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree %d",
- ret);
- goto restore;
- }
- }
- sb->s_flags &= ~SB_RDONLY;
+ btrfs_clear_sb_rdonly(sb);
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
out:
+ /*
+ * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
+ * since the absence of the flag means it can be toggled off by remount.
+ */
+ *flags |= SB_I_VERSION;
+
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_clear_oneshot_options(fs_info);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return 0;
restore:
/* We've hit an error - don't reset SB_RDONLY */
if (sb_rdonly(sb))
old_flags |= SB_RDONLY;
+ if (!(old_flags & SB_RDONLY))
+ clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
sb->s_flags = old_flags;
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
@@ -1894,20 +2173,21 @@ restore:
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
btrfs_remount_cleanup(fs_info, old_opts);
+ clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
return ret;
}
/* Used to sort the devices by max_avail(descending sort) */
-static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
- const void *dev_info2)
+static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
{
- if (((struct btrfs_device_info *)dev_info1)->max_avail >
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ const struct btrfs_device_info *dev_info1 = a;
+ const struct btrfs_device_info *dev_info2 = b;
+
+ if (dev_info1->max_avail > dev_info2->max_avail)
return -1;
- else if (((struct btrfs_device_info *)dev_info1)->max_avail <
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ else if (dev_info1->max_avail < dev_info2->max_avail)
return 1;
- else
return 0;
}
@@ -1966,12 +2246,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
- num_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
- num_stripes = 3;
- else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
- num_stripes = 4;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK)
+ num_stripes = rattr->ncopies;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
@@ -1995,17 +2271,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
- * In order to avoid overwriting the superblock on the drive,
- * btrfs starts at an offset of at least 1MB when doing chunk
- * allocation.
- *
- * This ensures we have at least min_stripe_size free space
- * after excluding 1MB.
+ * Ensure we have at least min_stripe_size on top of the
+ * reserved space on the device.
*/
- if (avail_space <= SZ_1M + min_stripe_size)
+ if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size)
continue;
- avail_space -= SZ_1M;
+ avail_space -= BTRFS_DEVICE_RANGE_RESERVED;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
@@ -2062,7 +2334,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_used = 0;
u64 total_free_data = 0;
u64 total_free_meta = 0;
- int bits = dentry->d_sb->s_blocksize_bits;
+ u32 bits = fs_info->sectorsize_bits;
__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
@@ -2070,8 +2342,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, &fs_info->space_info, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2100,8 +2371,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
@@ -2170,7 +2439,7 @@ static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
kill_anon_super(sb);
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
}
static struct file_system_type btrfs_fs_type = {
@@ -2186,7 +2455,7 @@ static struct file_system_type btrfs_root_fs_type = {
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("btrfs");
@@ -2203,13 +2472,14 @@ static int btrfs_control_open(struct inode *inode, struct file *file)
}
/*
- * used by btrfsctl to scan devices when no FS is mounted
+ * Used by /dev/btrfs-control for devices ioctls.
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct btrfs_ioctl_vol_args *vol;
struct btrfs_device *device = NULL;
+ dev_t devt = 0;
int ret = -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
@@ -2229,7 +2499,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
- ret = btrfs_forget_devices(vol->name);
+ if (vol->name[0] != 0) {
+ ret = lookup_bdev(vol->name, &devt);
+ if (ret)
+ break;
+ }
+ ret = btrfs_forget_devices(devt);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
@@ -2276,48 +2551,103 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans);
}
+static int check_dev_super(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_super_block *sb;
+ u16 csum_type;
+ int ret = 0;
+
+ /* This should be called with fs still frozen. */
+ ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+ /* Missing dev, no need to check. */
+ if (!dev->bdev)
+ return 0;
+
+ /* Only need to check the primary super block. */
+ sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* Verify the checksum. */
+ csum_type = btrfs_super_csum_type(sb);
+ if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
+ btrfs_err(fs_info, "csum type changed, has %u expect %u",
+ csum_type, btrfs_super_csum_type(fs_info->super_copy));
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_check_super_csum(fs_info, sb)) {
+ btrfs_err(fs_info, "csum for on-disk super block no longer matches");
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /* Btrfs_validate_super() includes fsid check against super->fsid. */
+ ret = btrfs_validate_super(fs_info, sb, 0);
+ if (ret < 0)
+ goto out;
+
+ if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+ btrfs_super_generation(sb),
+ fs_info->last_trans_committed);
+ ret = -EUCLEAN;
+ goto out;
+ }
+out:
+ btrfs_release_disk_super(sb);
+ return ret;
+}
+
static int btrfs_unfreeze(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ int ret = 0;
+ /*
+ * Make sure the fs is not changed by accident (like hibernation then
+ * modified by other OS).
+ * If we found anything wrong, we mark the fs error immediately.
+ *
+ * And since the fs is frozen, no one can modify the fs yet, thus
+ * we don't need to hold device_list_mutex.
+ */
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ ret = check_dev_super(device);
+ if (ret < 0) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "super block on devid %llu got modified unexpectedly",
+ device->devid);
+ break;
+ }
+ }
clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+ /*
+ * We still return 0, to allow VFS layer to unfreeze the fs even the
+ * above checks failed. Since the fs is either fine or read-only, we're
+ * safe to continue, without causing further damage.
+ */
return 0;
}
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_fs_devices *cur_devices;
- struct btrfs_device *dev, *first_dev = NULL;
- struct list_head *head;
/*
- * Lightweight locking of the devices. We should not need
- * device_list_mutex here as we only read the device data and the list
- * is protected by RCU. Even if a device is deleted during the list
- * traversals, we'll get valid data, the freeing callback will wait at
- * least until the rcu_read_unlock.
+ * There should be always a valid pointer in latest_dev, it may be stale
+ * for a short moment in case it's being deleted but still valid until
+ * the end of RCU grace period.
*/
rcu_read_lock();
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
- head = &cur_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
- cur_devices = cur_devices->seed;
- }
-
- if (first_dev)
- seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
- else
- WARN_ON(1);
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
+
return 0;
}
@@ -2379,6 +2709,16 @@ static void __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ", zoned=yes"
+#else
+ ", zoned=no"
+#endif
+#ifdef CONFIG_FS_VERITY
+ ", fsverity=yes"
+#else
+ ", fsverity=no"
+#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
@@ -2399,17 +2739,21 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_compress;
- err = extent_io_init();
+ err = extent_state_init_cachep();
if (err)
goto free_cachep;
- err = extent_state_cache_init();
+ err = extent_buffer_init_cachep();
+ if (err)
+ goto free_extent_cachep;
+
+ err = btrfs_bioset_init();
if (err)
- goto free_extent_io;
+ goto free_eb_cachep;
err = extent_map_init();
if (err)
- goto free_extent_state_cache;
+ goto free_bioset;
err = ordered_data_init();
if (err)
@@ -2431,15 +2775,9 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_ref;
- err = btrfs_end_io_wq_init();
- if (err)
- goto free_prelim_ref;
-
err = btrfs_interface_init();
if (err)
- goto free_end_io_wq;
-
- btrfs_init_lockdep();
+ goto free_prelim_ref;
btrfs_print_mod_info();
@@ -2455,8 +2793,6 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
-free_end_io_wq:
- btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
@@ -2469,10 +2805,12 @@ free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
-free_extent_state_cache:
- extent_state_cache_exit();
-free_extent_io:
- extent_io_exit();
+free_bioset:
+ btrfs_bioset_exit();
+free_eb_cachep:
+ extent_buffer_free_cachep();
+free_extent_cachep:
+ extent_state_free_cachep();
free_cachep:
btrfs_destroy_cachep();
free_compress:
@@ -2491,10 +2829,10 @@ static void __exit exit_btrfs_fs(void)
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
- extent_state_cache_exit();
- extent_io_exit();
+ btrfs_bioset_exit();
+ extent_state_free_cachep();
+ extent_buffer_free_cachep();
btrfs_interface_exit();
- btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3c10e78924d0..699b54b3acaa 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -14,11 +14,34 @@
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
#include "space-info.h"
#include "block-group.h"
+#include "qgroup.h"
+#include "misc.h"
+
+/*
+ * Structure name Path
+ * --------------------------------------------------------------------------
+ * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features
+ * btrfs_supported_feature_attrs /sys/fs/btrfs/features and
+ * /sys/fs/btrfs/<uuid>/features
+ * btrfs_attrs /sys/fs/btrfs/<uuid>
+ * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid>
+ * allocation_attrs /sys/fs/btrfs/<uuid>/allocation
+ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
+ * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
+ * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ * discard_attrs /sys/fs/btrfs/<uuid>/discard
+ *
+ * When built with BTRFS_CONFIG_DEBUG:
+ *
+ * btrfs_debug_feature_attrs /sys/fs/btrfs/debug
+ * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
+ */
struct btrfs_feature_attr {
struct kobj_attribute kobj_attr;
@@ -39,6 +62,10 @@ struct raid_kobject {
.store = _store, \
}
+#define BTRFS_ATTR_W(_prefix, _name, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
@@ -70,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static struct kobject *get_btrfs_kobj(struct kobject *kobj);
static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
{
@@ -155,7 +183,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -248,27 +276,41 @@ static umode_t btrfs_feature_visible(struct kobject *kobj,
return mode;
}
-BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
-BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+#ifdef CONFIG_BLK_DEV_ZONED
+BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
+/* Remove once support for extent tree v2 is feature complete */
+BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+#endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features - all available features implemented by this version
+ * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
+ * can be changed on a mounted filesystem.
+ */
static struct attribute *btrfs_supported_feature_attrs[] = {
- BTRFS_FEAT_ATTR_PTR(mixed_backref),
BTRFS_FEAT_ATTR_PTR(default_subvol),
BTRFS_FEAT_ATTR_PTR(mixed_groups),
BTRFS_FEAT_ATTR_PTR(compress_lzo),
BTRFS_FEAT_ATTR_PTR(compress_zstd),
- BTRFS_FEAT_ATTR_PTR(big_metadata),
BTRFS_FEAT_ATTR_PTR(extended_iref),
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
@@ -276,16 +318,19 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
+ BTRFS_FEAT_ATTR_PTR(block_group_tree),
+#ifdef CONFIG_BLK_DEV_ZONED
+ BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+#endif
+#ifdef CONFIG_FS_VERITY
+ BTRFS_FEAT_ATTR_PTR(verity),
+#endif
NULL
};
-/*
- * Features which depend on feature bits and may differ between each fs.
- *
- * /sys/fs/btrfs/features lists all available features of this kernel while
- * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
- * can be changed online.
- */
static const struct attribute_group btrfs_feature_attr_group = {
.name = "features",
.is_visible = btrfs_feature_visible,
@@ -295,7 +340,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@@ -310,39 +355,86 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
}
- ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
-static struct attribute *btrfs_supported_static_feature_attrs[] = {
- BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
- BTRFS_ATTR_PTR(static_feature, supported_checksums),
- NULL
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
+static const char *rescue_opts[] = {
+ "usebackuproot",
+ "nologreplay",
+ "ignorebadroots",
+ "ignoredatacsums",
+ "all",
};
+static ssize_t supported_rescue_options_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_rescue_options,
+ supported_rescue_options_show);
+
+static ssize_t supported_sectorsizes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+
+ /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (PAGE_SIZE > SZ_4K)
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
+
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_sectorsizes,
+ supported_sectorsizes_show);
+
/*
* Features which only depend on kernel version.
*
* These are listed in /sys/fs/btrfs/features along with
- * btrfs_feature_attr_group
+ * btrfs_supported_feature_attrs.
*/
+static struct attribute *btrfs_supported_static_feature_attrs[] = {
+ BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
+ BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
+ BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+ NULL
+};
+
static const struct attribute_group btrfs_static_feature_attr_group = {
.name = "features",
.attrs = btrfs_supported_static_feature_attrs,
};
-#ifdef CONFIG_BTRFS_DEBUG
-
/*
* Discard statistics and tunables
*/
-#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj))
static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -350,7 +442,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -361,7 +453,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -372,8 +464,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
- fs_info->discard_ctl.discard_bitmap_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -383,7 +475,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -394,8 +486,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
- fs_info->discard_ctl.discard_extent_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -405,8 +497,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.iops_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
}
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
@@ -423,7 +515,8 @@ static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
return -EINVAL;
WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
-
+ btrfs_discard_calc_delay(discard_ctl);
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
@@ -435,8 +528,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.kbps_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
@@ -453,7 +546,7 @@ static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
return -EINVAL;
WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
-
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
@@ -465,8 +558,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(fs_info->discard_ctl.max_discard_size));
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
@@ -489,7 +582,12 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
btrfs_discard_max_discard_size_store);
-static const struct attribute *discard_debug_attrs[] = {
+/*
+ * Per-filesystem stats for discard (when mounted with discard=async).
+ *
+ * Path: /sys/fs/btrfs/<uuid>/discard/
+ */
+static const struct attribute *discard_attrs[] = {
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
@@ -501,16 +599,22 @@ static const struct attribute *discard_debug_attrs[] = {
NULL,
};
+#ifdef CONFIG_BTRFS_DEBUG
+
/*
- * Runtime debugging exported via sysfs
+ * Per-filesystem runtime debugging exported via sysfs.
*
- * /sys/fs/btrfs/debug - applies to module or all filesystems
- * /sys/fs/btrfs/UUID - applies only to the given filesystem
+ * Path: /sys/fs/btrfs/UUID/debug/
*/
static const struct attribute *btrfs_debug_mount_attrs[] = {
NULL,
};
+/*
+ * Runtime debugging exported via sysfs, applies to all mounted filesystems.
+ *
+ * Path: /sys/fs/btrfs/debug
+ */
static struct attribute *btrfs_debug_feature_attrs[] = {
NULL
};
@@ -530,7 +634,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -576,9 +680,14 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
+/*
+ * Allocation information about block group profiles.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
+ */
static struct attribute *raid_attrs[] = {
BTRFS_ATTR_PTR(raid, total_bytes),
BTRFS_ATTR_PTR(raid, used_bytes),
@@ -607,15 +716,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
-static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
- struct kobj_attribute *a,
- char *buf)
+static ssize_t btrfs_chunk_size_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
- s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
- return snprintf(buf, PAGE_SIZE, "%lld\n", val);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size));
}
+/*
+ * Store new chunk size in space info. Can be called on a read-only filesystem.
+ *
+ * If the new chunk size value is larger than 10% of free space it is reduced
+ * to match that limit. Alignment must be to 256M and the system chunk size
+ * cannot be set.
+ */
+static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ char *retptr;
+ u64 val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!fs_info->fs_devices)
+ return -EINVAL;
+
+ if (btrfs_is_zoned(fs_info))
+ return -EINVAL;
+
+ /* System block type must not be changed. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -EPERM;
+
+ val = memparse(buf, &retptr);
+ /* There could be trailing '\n', also catch any typos after the value */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || val == 0)
+ return -EINVAL;
+
+ val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
+
+ /* Limit stripe size to 10% of available space. */
+ val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+
+ /* Must be multiple of 256M. */
+ val &= ~((u64)SZ_256M - 1);
+
+ /* Must be at least 256M. */
+ if (val < SZ_256M)
+ return -EINVAL;
+
+ btrfs_update_space_info_chunk_size(space_info, val);
+
+ return len;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Request chunk allocation with current chunk size.
+ */
+static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ struct btrfs_trans_handle *trans;
+ bool val;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return -EINVAL;
+
+ /*
+ * This is unsafe to be called from sysfs context and may cause
+ * unexpected problems.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_force_chunk_alloc(trans, space_info->flags);
+ btrfs_end_transaction(trans);
+
+ if (ret == 1)
+ return len;
+
+ return -ENOSPC;
+}
+BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store);
+
+#endif
+
SPACE_INFO_ATTR(flags);
SPACE_INFO_ATTR(total_bytes);
SPACE_INFO_ATTR(bytes_used);
@@ -623,11 +829,49 @@ SPACE_INFO_ATTR(bytes_pinned);
SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(bytes_readonly);
+SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(space_info, total_bytes_pinned,
- btrfs_space_info_show_total_bytes_pinned);
+BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+}
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int thresh;
+ int ret;
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh < 0 || thresh > 100)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
+ btrfs_sinfo_bg_reclaim_threshold_show,
+ btrfs_sinfo_bg_reclaim_threshold_store);
+
+/*
+ * Allocation information about block group types.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
+ */
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, flags),
BTRFS_ATTR_PTR(space_info, total_bytes),
@@ -636,9 +880,14 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_reserved),
BTRFS_ATTR_PTR(space_info, bytes_may_use),
BTRFS_ATTR_PTR(space_info, bytes_readonly),
+ BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
- BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
+ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, chunk_size),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -646,7 +895,6 @@ ATTRIBUTE_GROUPS(space_info);
static void space_info_release(struct kobject *kobj)
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
- percpu_counter_destroy(&sinfo->total_bytes_pinned);
kfree(sinfo);
}
@@ -656,6 +904,11 @@ static struct kobj_type space_info_ktype = {
.default_groups = space_info_groups,
};
+/*
+ * Allocation information about block groups.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/
+ */
static const struct attribute *allocation_attrs[] = {
BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
BTRFS_ATTR_PTR(allocation, global_rsv_size),
@@ -670,7 +923,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -718,7 +971,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -728,19 +981,59 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
+static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf,
+ "commits %llu\n"
+ "last_commit_ms %llu\n"
+ "max_commit_ms %llu\n"
+ "total_commit_ms %llu\n",
+ fs_info->commit_stats.commit_count,
+ div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
+}
+
+static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long val;
+ int ret;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return ret;
+ if (val)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0);
+
+ return len;
+}
+BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -752,7 +1045,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return sysfs_emit(buf, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -790,8 +1083,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%pU\n",
- fs_info->fs_devices->metadata_uuid);
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
@@ -802,13 +1094,145 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return snprintf(buf, PAGE_SIZE, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ str = "balance paused\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return sysfs_emit(buf, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
+static ssize_t btrfs_generation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
+}
+BTRFS_ATTR(, generation, btrfs_generation_show);
+
+static const char * const btrfs_read_policy_name[] = { "pid" };
+
+static ssize_t btrfs_read_policy_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (fs_devices->read_policy == i)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ else
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+
+ return ret;
+}
+
+static ssize_t btrfs_read_policy_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
+ if (i != fs_devices->read_policy) {
+ fs_devices->read_policy = i;
+ btrfs_info(fs_devices->fs_info,
+ "read policy set to '%s'",
+ btrfs_read_policy_name[i]);
+ }
+ return len;
+ }
+ }
+
+ return -EINVAL;
+}
+BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+
+static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
+}
+
+static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh != 0 && (thresh <= 50 || thresh > 100))
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
+ btrfs_bg_reclaim_threshold_store);
+
+/*
+ * Per-filesystem information and stats.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/
+ */
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -817,6 +1241,11 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
+ BTRFS_ATTR_PTR(, generation),
+ BTRFS_ATTR_PTR(, read_policy),
+ BTRFS_ATTR_PTR(, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(, commit_stats),
NULL,
};
@@ -847,11 +1276,26 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
return to_fs_devs(kobj)->fs_info;
}
+static struct kobject *get_btrfs_kobj(struct kobject *kobj)
+{
+ while (kobj) {
+ if (kobj->ktype == &btrfs_ktype)
+ return kobj;
+ kobj = kobj->parent;
+ }
+ return NULL;
+}
+
#define NUM_FEATURE_BITS 64
#define BTRFS_FEATURE_NAME_MAX 13
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) ==
+ ARRAY_SIZE(btrfs_feature_attrs));
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) ==
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
static const u64 supported_feature_masks[FEAT_MAX] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
@@ -935,22 +1379,37 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
- btrfs_reset_fs_info_ptr(fs_info);
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+
+ sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
-#ifdef CONFIG_BTRFS_DEBUG
- if (fs_info->discard_debug_kobj) {
- sysfs_remove_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
- kobject_del(fs_info->discard_debug_kobj);
- kobject_put(fs_info->discard_debug_kobj);
+ if (fs_info->discard_kobj) {
+ sysfs_remove_files(fs_info->discard_kobj, discard_attrs);
+ kobject_del(fs_info->discard_kobj);
+ kobject_put(fs_info->discard_kobj);
}
+#ifdef CONFIG_BTRFS_DEBUG
if (fs_info->debug_kobj) {
sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
kobject_del(fs_info->debug_kobj);
@@ -958,9 +1417,9 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
}
#endif
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -969,7 +1428,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -992,7 +1451,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
continue;
name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
- len += snprintf(str + len, bufsize - len, "%s%s",
+ len += scnprintf(str + len, bufsize - len, "%s%s",
len ? "," : "", name);
}
@@ -1004,11 +1463,6 @@ static void init_feature_attrs(void)
struct btrfs_feature_attr *fa;
int set, i;
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
- ARRAY_SIZE(btrfs_feature_attrs));
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
- ARRAY_SIZE(btrfs_feature_attrs[0]));
-
memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
memset(btrfs_unknown_feature_names, 0,
sizeof(btrfs_unknown_feature_names));
@@ -1075,17 +1529,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -1124,7 +1599,7 @@ static const char *alloc_name(u64 flags)
default:
WARN_ON(1);
return "invalid-combination";
- };
+ }
}
/*
@@ -1147,48 +1622,25 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->devices_kobj)
- return -EINVAL;
-
- if (one_device) {
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
-
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
-
- return 0;
- }
-
- list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
+ if (device->bdev)
+ sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
- wait_for_completion(&one_device->kobj_unregister);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
@@ -1201,11 +1653,11 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
-static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
+static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
int val;
@@ -1214,9 +1666,9 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
-BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show);
+BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -1228,10 +1680,36 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
+}
+
+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+ char *endptr;
+ unsigned long long limit;
+
+ limit = memparse(buf, &endptr);
+ WRITE_ONCE(device->scrub_speed_max, limit);
+ return len;
+}
+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show,
+ btrfs_devinfo_scrub_speed_max_store);
+
static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1241,14 +1719,60 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+static ssize_t btrfs_devinfo_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return sysfs_emit(buf, "%pU\n", device->fs_devices->fsid);
+}
+BTRFS_ATTR(devid, fsid, btrfs_devinfo_fsid_show);
+
+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ if (!device->dev_stats_valid)
+ return sysfs_emit(buf, "invalid\n");
+
+ /*
+ * Print all at once so we get a snapshot of all values from the same
+ * time. Keep them in sync and in order of definition of
+ * btrfs_dev_stat_values.
+ */
+ return sysfs_emit(buf,
+ "write_errs %d\n"
+ "read_errs %d\n"
+ "flush_errs %d\n"
+ "corruption_errs %d\n"
+ "generation_errs %d\n",
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+
+/*
+ * Information about one device.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
+ */
static struct attribute *devid_attrs[] = {
+ BTRFS_ATTR_PTR(devid, error_stats),
+ BTRFS_ATTR_PTR(devid, fsid),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
+ BTRFS_ATTR_PTR(devid, scrub_speed_max),
BTRFS_ATTR_PTR(devid, writeable),
NULL
};
@@ -1269,41 +1793,76 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
-
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ int ret;
+ unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
- if (one_device && one_device != dev)
- continue;
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
- if (dev->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ nofs_flag = memalloc_nofs_save();
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ if (device->bdev) {
+ struct kobject *disk_kobj = bdev_kobj(device->bdev);
- error = sysfs_create_link(fs_devices->devices_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
}
+ }
- init_completion(&dev->kobj_unregister);
- error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devinfo_kobj, "%llu",
- dev->devid);
- if (error) {
- kobject_put(&dev->devid_kobj);
- break;
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
+ }
+
+out:
+ memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
+
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
}
}
- return error;
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1317,8 +1876,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1326,7 +1885,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
@@ -1371,7 +1930,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
if (!fs_devs->devices_kobj) {
btrfs_err(fs_devs->fs_info,
"failed to init sysfs device interface");
- kobject_put(&fs_devs->fsid_kobj);
+ btrfs_sysfs_remove_fsid(fs_devs);
return -ENOMEM;
}
@@ -1393,15 +1952,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_device_link(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_rm_device_link(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1420,25 +1977,27 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
if (error)
goto failure;
+#endif
/* Discard directory */
- fs_info->discard_debug_kobj = kobject_create_and_add("discard",
- fs_info->debug_kobj);
- if (!fs_info->discard_debug_kobj) {
+ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
+ if (!fs_info->discard_kobj) {
error = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
+ error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
if (error)
goto failure;
-#endif
error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
goto failure;
+ error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (error)
+ goto failure;
+
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
@@ -1456,6 +2015,256 @@ failure:
return error;
}
+static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool enabled;
+
+ spin_lock(&fs_info->qgroup_lock);
+ enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", enabled);
+}
+BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+
+static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool inconsistent;
+
+ spin_lock(&fs_info->qgroup_lock);
+ inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", inconsistent);
+}
+BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show);
+
+static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 result;
+
+ spin_lock(&fs_info->qgroup_lock);
+ result = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 new_thres;
+ int ret;
+
+ ret = kstrtou8(buf, 10, &new_thres);
+ if (ret)
+ return -EINVAL;
+
+ if (new_thres > BTRFS_MAX_LEVEL)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->qgroup_drop_subtree_thres = new_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return len;
+}
+BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show,
+ qgroup_drop_subtree_thres_store);
+
+/*
+ * Qgroups global info
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/
+ */
+static struct attribute *qgroups_attrs[] = {
+ BTRFS_ATTR_PTR(qgroups, enabled),
+ BTRFS_ATTR_PTR(qgroups, inconsistent),
+ BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroups);
+
+static void qgroups_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static struct kobj_type qgroups_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = qgroups_groups,
+ .release = qgroups_release,
+};
+
+static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
+{
+ return to_fs_info(kobj->parent->parent);
+}
+
+#define QGROUP_ATTR(_member, _show_name) \
+static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member)
+
+#define QGROUP_RSV_ATTR(_name, _type) \
+static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \
+ struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \
+ struct btrfs_qgroup, kobj); \
+ return btrfs_show_u64(&qgroup->rsv.values[_type], \
+ &fs_info->qgroup_lock, buf); \
+} \
+BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
+
+QGROUP_ATTR(rfer, referenced);
+QGROUP_ATTR(excl, exclusive);
+QGROUP_ATTR(max_rfer, max_referenced);
+QGROUP_ATTR(max_excl, max_exclusive);
+QGROUP_ATTR(lim_flags, limit_flags);
+QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
+QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+
+/*
+ * Qgroup information.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
+ */
+static struct attribute *qgroup_attrs[] = {
+ BTRFS_ATTR_PTR(qgroup, referenced),
+ BTRFS_ATTR_PTR(qgroup, exclusive),
+ BTRFS_ATTR_PTR(qgroup, max_referenced),
+ BTRFS_ATTR_PTR(qgroup, max_exclusive),
+ BTRFS_ATTR_PTR(qgroup, limit_flags),
+ BTRFS_ATTR_PTR(qgroup, rsv_data),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
+ BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroup);
+
+static void qgroup_release(struct kobject *kobj)
+{
+ struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj);
+
+ memset(&qgroup->kobj, 0, sizeof(*kobj));
+}
+
+static struct kobj_type qgroup_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = qgroup_release,
+ .default_groups = qgroup_groups,
+};
+
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ struct kobject *qgroups_kobj = fs_info->qgroups_kobj;
+ int ret;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+ if (qgroup->kobj.state_initialized)
+ return 0;
+ if (!qgroups_kobj)
+ return -EINVAL;
+
+ ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
+ "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid));
+ if (ret < 0)
+ kobject_put(&qgroup->kobj);
+
+ return ret;
+}
+
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node)
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ if (fs_info->qgroups_kobj) {
+ kobject_del(fs_info->qgroups_kobj);
+ kobject_put(fs_info->qgroups_kobj);
+ fs_info->qgroups_kobj = NULL;
+ }
+}
+
+/* Called when qgroups get initialized, thus there is no need for locking */
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
+{
+ struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *next;
+ int ret = 0;
+
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return 0;
+
+ ASSERT(fsid_kobj);
+ if (fs_info->qgroups_kobj)
+ return 0;
+
+ fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!fs_info->qgroups_kobj)
+ return -ENOMEM;
+
+ ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype,
+ fsid_kobj, "qgroups");
+ if (ret < 0)
+ goto out;
+
+ rbtree_postorder_for_each_entry_safe(qgroup, next,
+ &fs_info->qgroup_tree, node) {
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ if (ret < 0)
+ btrfs_sysfs_del_qgroups(fs_info);
+ return ret;
+}
+
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup)
+{
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))
+ return;
+
+ if (qgroup->kobj.state_initialized) {
+ kobject_del(&qgroup->kobj);
+ kobject_put(&qgroup->kobj);
+ }
+}
/*
* Change per-fs features in /sys/fs/btrfs/UUID/features to match current
@@ -1466,12 +2275,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
@@ -1532,4 +2345,3 @@ void __cold btrfs_exit_sysfs(void)
#endif
kset_unregister(btrfs_kset);
}
-
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index c68582add92e..bacef43f7267 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,15 +13,12 @@ enum btrfs_feature_set {
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
@@ -36,4 +33,11 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup);
+
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 84fb3fa940a6..d43cb5242fec 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -55,8 +55,15 @@ struct inode *btrfs_new_test_inode(void)
struct inode *inode;
inode = new_inode(test_mnt->mnt_sb);
- if (inode)
- inode_init_owner(inode, NULL, S_IFREG);
+ if (!inode)
+ return NULL;
+
+ inode->i_mode = S_IFREG;
+ inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+ inode_init_owner(&init_user_ns, inode, NULL, S_IFREG);
return inode;
}
@@ -120,6 +127,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
kfree(fs_info);
return NULL;
}
+ INIT_LIST_HEAD(&fs_info->fs_devices->devices);
+
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
GFP_KERNEL);
if (!fs_info->super_copy) {
@@ -128,39 +137,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
return NULL;
}
+ btrfs_init_fs_info(fs_info);
+
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
-
- if (init_srcu_struct(&fs_info->subvol_srcu)) {
- kfree(fs_info->fs_devices);
- kfree(fs_info->super_copy);
- kfree(fs_info);
- return NULL;
- }
-
- spin_lock_init(&fs_info->buffer_lock);
- spin_lock_init(&fs_info->qgroup_lock);
- spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->fs_roots_radix_lock);
- mutex_init(&fs_info->qgroup_ioctl_lock);
- mutex_init(&fs_info->qgroup_rescan_lock);
- rwlock_init(&fs_info->tree_mod_log_lock);
- fs_info->running_transaction = NULL;
- fs_info->qgroup_tree = RB_ROOT;
- fs_info->qgroup_ulist = NULL;
- atomic64_set(&fs_info->tree_mod_seq, 0);
- INIT_LIST_HEAD(&fs_info->dirty_qgroups);
- INIT_LIST_HEAD(&fs_info->dead_roots);
- INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
- INIT_LIST_HEAD(&fs_info->fs_devices->devices);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
- IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
- IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
- extent_map_tree_init(&fs_info->mapping_tree);
- fs_info->pinned_extents = &fs_info->freed_extents[0];
+ fs_info->sectorsize_bits = ilog2(sectorsize);
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -210,24 +191,22 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
- cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->super_copy);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->fs_devices);
kfree(fs_info);
}
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
- if (root->node) {
- /* One for allocate_extent_buffer */
- free_extent_buffer(root->node);
- }
- kfree(root);
+ btrfs_global_root_delete(root);
+ btrfs_put_root(root);
}
struct btrfs_block_group *
@@ -254,7 +233,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
- btrfs_init_free_space_ctl(cache);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
mutex_init(&cache->free_space_lock);
return cache;
@@ -264,7 +243,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
kfree(cache->free_space_ctl);
kfree(cache);
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b5978e..b7d181a08eab 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -15,7 +15,6 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
struct btrfs_path *path = NULL;
struct btrfs_root *root = NULL;
struct extent_buffer *eb;
- struct btrfs_item *item;
char *value = "mary had a little lamb";
char *split1 = "mary had a little";
char *split2 = " lamb";
@@ -48,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ path->nodes[0] = eb;
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
@@ -60,9 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
- item = btrfs_item_nr(0);
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
@@ -91,8 +89,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(0);
- if (btrfs_item_size(eb, item) != strlen(split1)) {
+ if (btrfs_item_size(eb, 0) != strlen(split1)) {
test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
@@ -116,8 +113,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(1);
- if (btrfs_item_size(eb, item) != strlen(split2)) {
+ if (btrfs_item_size(eb, 1) != strlen(split2)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
@@ -148,8 +144,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(0);
- if (btrfs_item_size(eb, item) != strlen(split3)) {
+ if (btrfs_item_size(eb, 0) != strlen(split3)) {
test_err("invalid len in the first split");
ret = -EINVAL;
goto out;
@@ -172,8 +167,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(1);
- if (btrfs_item_size(eb, item) != strlen(split4)) {
+ if (btrfs_item_size(eb, 1) != strlen(split4)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
@@ -196,8 +190,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- item = btrfs_item_nr(2);
- if (btrfs_item_size(eb, item) != strlen(split2)) {
+ if (btrfs_item_size(eb, 2) != strlen(split2)) {
test_err("invalid len in the second split");
ret = -EINVAL;
goto out;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index df7ce874a74b..350da449db08 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -4,6 +4,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sizes.h>
@@ -20,42 +21,90 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
- struct page *pages[16];
+ struct folio_batch fbatch;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- unsigned long nr_pages = end_index - index + 1;
int i;
int count = 0;
int loops = 0;
- while (nr_pages > 0) {
- ret = find_get_pages_contig(inode->i_mapping, index,
- min_t(unsigned long, nr_pages,
- ARRAY_SIZE(pages)), pages);
+ folio_batch_init(&fbatch);
+
+ while (index <= end_index) {
+ ret = filemap_get_folios_contig(inode->i_mapping, &index,
+ end_index, &fbatch);
for (i = 0; i < ret; i++) {
+ struct folio *folio = fbatch.folios[i];
+
if (flags & PROCESS_TEST_LOCKED &&
- !PageLocked(pages[i]))
+ !folio_test_locked(folio))
count++;
- if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (flags & PROCESS_UNLOCK && folio_test_locked(folio))
+ folio_unlock(folio);
if (flags & PROCESS_RELEASE)
- put_page(pages[i]);
+ folio_put(folio);
}
- nr_pages -= ret;
- index += ret;
+ folio_batch_release(&fbatch);
cond_resched();
loops++;
if (loops > 100000) {
printk(KERN_ERR
- "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n",
- start, end, nr_pages, ret);
+ "stuck in a loop, start %llu, end %llu, ret %d\n",
+ start, end, ret);
break;
}
}
+
return count;
}
+#define STATE_FLAG_STR_LEN 256
+
+#define PRINT_ONE_FLAG(state, dest, cur, name) \
+({ \
+ if (state->state & EXTENT_##name) \
+ cur += scnprintf(dest + cur, STATE_FLAG_STR_LEN - cur, \
+ "%s" #name, cur == 0 ? "" : "|"); \
+})
+
+static void extent_flag_to_str(const struct extent_state *state, char *dest)
+{
+ int cur = 0;
+
+ dest[0] = 0;
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY);
+ PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
+ PRINT_ONE_FLAG(state, dest, cur, LOCKED);
+ PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
+ PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
+ PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
+ PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
+ PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
+ PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
+ PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
+ PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
+}
+
+static void dump_extent_io_tree(const struct extent_io_tree *tree)
+{
+ struct rb_node *node;
+ char flags_str[STATE_FLAG_STR_LEN];
+
+ node = rb_first(&tree->state);
+ test_msg("io tree content:");
+ while (node) {
+ struct extent_state *state;
+
+ state = rb_entry(node, struct extent_state, rb_node);
+ extent_flag_to_str(state, flags_str);
+ test_msg(" start=%llu len=%llu flags=%s", state->start,
+ state->end + 1 - state->start, flags_str);
+ node = rb_next(node);
+ }
+}
+
static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
@@ -112,7 +161,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -124,7 +173,7 @@ static int test_find_delalloc(u32 sectorsize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -143,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize)
}
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -160,7 +209,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -177,14 +226,14 @@ static int test_find_delalloc(u32 sectorsize)
goto out_bits;
}
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
}
- if (end != (u64)-1) {
+ if (end != test_start + PAGE_SIZE - 1) {
test_err("did not return the proper end offset");
goto out_bits;
}
@@ -198,7 +247,7 @@ static int test_find_delalloc(u32 sectorsize)
*/
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
@@ -215,7 +264,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
@@ -233,7 +282,7 @@ static int test_find_delalloc(u32 sectorsize)
/* We unlocked it in the previous test */
lock_page(locked_page);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
/*
* Currently if we fail to find dirty pages in the delalloc range we
* will adjust max_bytes down to PAGE_SIZE and then re-search. If
@@ -258,6 +307,8 @@ static int test_find_delalloc(u32 sectorsize)
}
ret = 0;
out_bits:
+ if (ret)
+ dump_extent_io_tree(tmp);
clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
@@ -379,54 +430,50 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- unsigned long len;
unsigned long *bitmap = NULL;
struct extent_buffer *eb = NULL;
int ret;
test_msg("running extent buffer bitmap tests");
- /*
- * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
- * BTRFS_MAX_METADATA_BLOCKSIZE.
- */
- len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE)
- ? sectorsize * 4 : sectorsize;
-
- fs_info = btrfs_alloc_dummy_fs_info(len, len);
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
}
- bitmap = kmalloc(len, GFP_KERNEL);
+ bitmap = kmalloc(nodesize, GFP_KERNEL);
if (!bitmap) {
test_err("couldn't allocate test bitmap");
ret = -ENOMEM;
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
+ eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, len);
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
if (ret)
goto out;
- /* Do it over again with an extent buffer which isn't page-aligned. */
free_extent_buffer(eb);
- eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len);
+
+ /*
+ * Test again for case where the tree block is sectorsize aligned but
+ * not nodesize aligned.
+ */
+ eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, len);
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
out:
free_extent_buffer(eb);
kfree(bitmap);
@@ -538,6 +585,8 @@ static int test_find_first_clear_extent_bit(void)
ret = 0;
out:
+ if (ret)
+ dump_extent_io_tree(&tree);
clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
return ret;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 57379e96ccc9..c5b3a631bf4f 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
+ write_lock(&em_tree->lock);
while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
@@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#endif
free_extent_map(em);
}
+ write_unlock(&em_tree->lock);
}
/*
@@ -507,7 +509,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
@@ -557,7 +559,7 @@ int btrfs_test_extent_map(void)
{
/*
* Test a chunk with 2 data stripes one of which
- * interesects the physical address of the super block
+ * intersects the physical address of the super block
* is correctly recognised.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index aebdf23f0cdd..ebf68fcd2149 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache)
}
/* Cleanup */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/* Now with the extent entry offset into the bitmap */
ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
* [ bitmap ]
* [ del ]
*/
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_err("couldn't add bitmap %d", ret);
@@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* This blew up before, we have part of the free space in a bitmap and
@@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return ret;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -399,7 +399,6 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
u64 offset;
u64 max_extent_size;
const struct btrfs_free_space_op test_free_space_ops = {
- .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
.use_bitmap = test_use_bitmap,
};
const struct btrfs_free_space_op *orig_free_space_ops;
@@ -630,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
if (ret)
return ret;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* Now test a similar scenario, but where our extent entry is located
@@ -820,11 +819,189 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
return ret;
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
+static bool bytes_index_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ return true;
+}
+
+static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
+{
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .use_bitmap = bytes_index_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
+ struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ u64 offset, max_extent_size, bytes;
+ int ret, i;
+
+ test_msg("running bytes index tests");
+
+ /* First just validate that it does everything in order. */
+ offset = 0;
+ for (i = 0; i < 10; i++) {
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 0);
+ if (ret) {
+ test_err("couldn't add extent entry %d\n", ret);
+ return ret;
+ }
+ offset += bytes + sectorsize;
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 9; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps do the correct thing. */
+ btrfs_remove_free_space_cache(cache);
+ for (i = 0; i < 2; i++) {
+ offset = i * BITS_PER_BITMAP * sectorsize;
+ bytes = (i + 1) * SZ_1M;
+ ret = test_add_free_space_entry(cache, offset, bytes, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+ }
+
+ for (node = rb_first_cached(&ctl->free_space_bytes), i = 1; node;
+ node = rb_next(node), i--) {
+ entry = rb_entry(node, struct btrfs_free_space, bytes_index);
+ bytes = (i + 1) * SZ_1M;
+ if (entry->bytes != bytes) {
+ test_err("invalid bytes index order, found %llu expected %llu",
+ entry->bytes, bytes);
+ return -EINVAL;
+ }
+ }
+
+ /* Now validate bitmaps with different ->max_extent_size. */
+ btrfs_remove_free_space_cache(cache);
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
+
+ ret = test_add_free_space_entry(cache, 0, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap entry");
+ return ret;
+ }
+
+ offset = BITS_PER_BITMAP * sectorsize;
+ ret = test_add_free_space_entry(cache, offset, sectorsize, 1);
+ if (ret) {
+ test_err("couldn't add bitmap_entry");
+ return ret;
+ }
+
+ /*
+ * Now set a bunch of sectorsize extents in the first entry so it's
+ * ->bytes is large.
+ */
+ for (i = 2; i < 20; i += 2) {
+ offset = sectorsize * i;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error populating sparse bitmap %d", ret);
+ return ret;
+ }
+ }
+
+ /*
+ * Now set a contiguous extent in the second bitmap so its
+ * ->max_extent_size is larger than the first bitmaps.
+ */
+ offset = (BITS_PER_BITMAP * sectorsize) + sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding contiguous extent %d", ret);
+ return ret;
+ }
+
+ /*
+ * Since we don't set ->max_extent_size unless we search everything
+ * should be indexed on bytes.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (10 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 3,
+ 0, &max_extent_size);
+ if (offset != 0) {
+ test_err("found space to alloc even though we don't have enough space");
+ return -EINVAL;
+ }
+
+ if (max_extent_size != (2 * sectorsize)) {
+ test_err("got the wrong max_extent size %llu expected %llu",
+ max_extent_size, (unsigned long long)(2 * sectorsize));
+ return -EINVAL;
+ }
+
+ /*
+ * The search should have re-arranged the bytes index to use the
+ * ->max_extent_size, validate it's now what we expect it to be.
+ */
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (2 * sectorsize)) {
+ test_err("error, the bytes index wasn't recalculated properly");
+ return -EINVAL;
+ }
+
+ /* Add another sectorsize to re-arrange the tree back to ->bytes. */
+ offset = (BITS_PER_BITMAP * sectorsize) - sectorsize;
+ ret = btrfs_add_free_space(cache, offset, sectorsize);
+ if (ret) {
+ test_err("error adding extent to the sparse entry %d", ret);
+ return ret;
+ }
+
+ entry = rb_entry(rb_first_cached(&ctl->free_space_bytes),
+ struct btrfs_free_space, bytes_index);
+ if (entry->bytes != (11 * sectorsize)) {
+ test_err("error, wrong entry in the first slot in bytes_index");
+ return -EINVAL;
+ }
+
+ /*
+ * Now make sure we find our correct entry after searching that will
+ * result in a re-arranging of the tree.
+ */
+ max_extent_size = 0;
+ offset = btrfs_find_space_for_alloc(cache, cache->start, sectorsize * 2,
+ 0, &max_extent_size);
+ if (offset != (BITS_PER_BITMAP * sectorsize)) {
+ test_err("error, found %llu instead of %llu for our alloc",
+ offset,
+ (unsigned long long)(BITS_PER_BITMAP * sectorsize));
+ return -EINVAL;
+ }
+
+ cache->free_space_ctl->op = orig_free_space_ops;
+ btrfs_remove_free_space_cache(cache);
+ return 0;
+}
+
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
@@ -859,7 +1036,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
goto out;
}
- root->fs_info->extent_root = root;
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
ret = test_extents(cache);
if (ret)
@@ -872,6 +1052,9 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
goto out;
ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize);
+ if (ret)
+ goto out;
+ ret = test_bytes_index(cache, sectorsize);
out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 914eea5ba6a7..13734ed43bfc 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -60,8 +60,6 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
- if (i >= num_extents)
- goto invalid;
if (i >= num_extents ||
extent_start != extents[i].start ||
offset - extent_start != extents[i].length)
@@ -448,7 +446,10 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
- root->fs_info->free_space_root = root;
+ root->root_key.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 24a8c714f56c..625f7d398368 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
}
/*
@@ -234,11 +232,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
return ret;
}
- inode->i_mode = S_IFREG;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
-
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
@@ -274,7 +267,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* All of the magic numbers are based on the mapping setup in
@@ -837,10 +830,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
return ret;
}
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
-
fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
@@ -951,11 +940,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
- ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0,
- NULL);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
+ BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL);
if (ret) {
test_err("btrfs_set_extent_delalloc returned %d", ret);
goto out;
@@ -968,7 +956,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE,
BTRFS_MAX_EXTENT_SIZE + sectorsize - 1,
0, NULL);
if (ret) {
@@ -986,7 +974,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -999,7 +988,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */
- ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1)
+ sectorsize - 1,
0, NULL);
@@ -1017,7 +1006,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize,
(BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1,
0, NULL);
@@ -1035,7 +1024,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/*
* [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize]
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
@@ -1053,7 +1042,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1069,7 +1059,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
* Refill the hole again just for good measure, because I thought it
* might fail and I'd rather satisfy my paranoia at this point.
*/
- ret = btrfs_set_extent_delalloc(inode,
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode),
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL);
if (ret) {
@@ -1085,7 +1075,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1100,7 +1091,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_UPTODATE, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ac035a6fa003..63676ea19f29 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -36,7 +36,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
test_err("couldn't insert ref %d", ret);
@@ -86,7 +85,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
@@ -135,7 +133,6 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
@@ -170,7 +167,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
@@ -227,24 +223,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -256,31 +250,31 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -328,24 +322,22 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -363,24 +355,22 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -404,24 +394,22 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -469,7 +457,10 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
}
/* We are using this root as our extent root */
- root->fs_info->extent_root = root;
+ root->root_key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ btrfs_global_root_insert(root);
/*
* Some of the paths we test assume we have a filled out fs_info, so we
@@ -507,6 +498,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
test_err("couldn't insert fs root %d", ret);
goto out;
}
+ btrfs_put_root(tmp_root);
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
@@ -521,6 +513,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
test_err("couldn't insert fs root %d", ret);
goto out;
}
+ btrfs_put_root(tmp_root);
test_msg("running qgroup tests");
ret = test_no_shared_qgroup(root, sectorsize, nodesize);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index beb6c69cd1e5..d1f1da6820fb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -10,17 +10,19 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
-#include "inode-map.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
+#include "space-info.h"
+#include "zoned.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -107,6 +109,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK |
__TRANS_JOIN_NOSTART),
+ [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
[TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
@@ -141,7 +148,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
struct btrfs_block_group,
bg_list);
list_del_init(&cache->bg_list);
- btrfs_put_block_group_trimming(cache);
+ btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
WARN_ON(!list_empty(&transaction->dev_update_list));
@@ -155,14 +162,22 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);
+
down_write(&fs_info->commit_root_sem);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ fs_info->last_reloc_trans = trans->transid;
+
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
dirty_list) {
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->root_key.objectid))
- btrfs_unpin_free_ino(root);
extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
@@ -179,6 +194,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
spin_lock(&cur_trans->dropped_roots_lock);
}
spin_unlock(&cur_trans->dropped_roots_lock);
+
up_write(&fs_info->commit_root_sem);
}
@@ -208,8 +224,11 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
}
/*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
@@ -218,10 +237,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
if (!trans->chunk_bytes_reserved)
return;
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
- trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved, NULL);
trans->chunk_bytes_reserved = 0;
}
@@ -236,14 +253,14 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
cur_trans = fs_info->running_transaction;
if (cur_trans) {
- if (cur_trans->aborted) {
+ if (TRANS_ABORTED(cur_trans)) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
@@ -255,6 +272,8 @@ loop:
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
return 0;
}
spin_unlock(&fs_info->trans_lock);
@@ -276,21 +295,30 @@ loop:
if (!cur_trans)
return -ENOMEM;
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+
spin_lock(&fs_info->trans_lock);
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
goto loop;
- } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ } else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
return -EROFS;
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -333,9 +361,13 @@ loop:
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
+ INIT_LIST_HEAD(&cur_trans->releasing_ebs);
+ spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+ IO_TREE_TRANS_DIRTY_PAGES, NULL);
+ extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -346,20 +378,20 @@ loop:
}
/*
- * this does all the record keeping required to make sure that a reference
- * counted root is properly recorded in a given transaction. This is required
- * to make sure the old root from before we joined the transaction is deleted
- * when the transaction commits
+ * This does all the record keeping required to make sure that a shareable root
+ * is properly recorded in a given transaction. This is required to make sure
+ * the old root from before we joined the transaction is deleted when the
+ * transaction commits.
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
- if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
- WARN_ON(root == fs_info->extent_root);
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -404,11 +436,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* lock. smp_wmb() makes sure that all the writes above are
* done before we pop in the zero below
*/
- btrfs_init_reloc_root(trans, root);
+ ret = btrfs_init_reloc_root(trans, root);
smp_mb__before_atomic();
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
- return 0;
+ return ret;
}
@@ -435,8 +467,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
/*
@@ -449,17 +482,17 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&fs_info->reloc_mutex);
- record_root_in_trans(trans, root, 0);
+ ret = record_root_in_trans(trans, root, 0);
mutex_unlock(&fs_info->reloc_mutex);
- return 0;
+ return ret;
}
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
- !trans->aborted);
+ !TRANS_ABORTED(trans));
}
/* wait for commit against the current transaction to become unblocked
@@ -476,9 +509,10 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
- cur_trans->aborted);
+ TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&fs_info->trans_lock);
@@ -501,7 +535,7 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!fs_info->reloc_ctl ||
- !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
@@ -521,12 +555,10 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
bool reloc_reserved = false;
+ bool do_chunk_alloc = false;
int ret;
- /* Send isn't supposed to start transactions. */
- ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
-
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return ERR_PTR(-EROFS);
if (current->journal_info) {
@@ -561,7 +593,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* refill that amount for whatever is missing in the reserve.
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (delayed_refs_rsv->full == 0) {
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
}
@@ -574,7 +607,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
+ ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
if (ret)
goto reserve_fail;
if (delayed_refs_bytes) {
@@ -582,8 +615,11 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
delayed_refs_bytes);
num_bytes -= delayed_refs_bytes;
}
+
+ if (rsv->space_info->force_alloc)
+ do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
- !delayed_refs_rsv->full) {
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
/*
* Some people call with btrfs_start_transaction(root, 0)
* because they can be throttled, but have some other mechanism
@@ -635,12 +671,10 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->root = root;
refcount_set(&h->use_count, 1);
h->fs_info = root->fs_info;
h->type = type;
- h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
@@ -660,10 +694,41 @@ again:
}
got_it:
- btrfs_record_root_in_trans(h, root);
-
if (!current->journal_info)
current->journal_info = h;
+
+ /*
+ * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
+ * ALLOC_FORCE the first run through, and then we won't allocate for
+ * anybody else who races in later. We don't care about the return
+ * value here.
+ */
+ if (do_chunk_alloc && num_bytes) {
+ u64 flags = h->block_rsv->space_info->flags;
+
+ btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
+ CHUNK_ALLOC_NO_FORCE);
+ }
+
+ /*
+ * btrfs_record_root_in_trans() needs to alloc new extents, and may
+ * call btrfs_join_transaction() while we're also starting a
+ * transaction.
+ *
+ * Thus it need to be called after current->journal_info initialized,
+ * or we can deadlock.
+ */
+ ret = btrfs_record_root_in_trans(h, root);
+ if (ret) {
+ /*
+ * The transaction handle is fully initialized and linked with
+ * other structures so it needs to be ended in case of errors,
+ * not just freed.
+ */
+ btrfs_end_transaction(h);
+ return ERR_PTR(ret);
+ }
+
return h;
join_fail:
@@ -673,7 +738,7 @@ join_fail:
alloc_fail:
if (num_bytes)
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes);
+ num_bytes, NULL);
reserve_fail:
btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -688,43 +753,10 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
- unsigned int num_items,
- int min_factor)
+ unsigned int num_items)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_trans_handle *trans;
- u64 num_bytes;
- int ret;
-
- /*
- * We have two callers: unlink and block group removal. The
- * former should succeed even if we will temporarily exceed
- * quota and the latter operates on the extent root so
- * qgroup enforcement is ignored anyway.
- */
- trans = start_transaction(root, num_items, TRANS_START,
- BTRFS_RESERVE_FLUSH_ALL, false);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
-
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
- num_bytes, min_factor);
- if (ret) {
- btrfs_end_transaction(trans);
- return ERR_PTR(ret);
- }
-
- trans->block_rsv = &fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, num_bytes, 1);
-
- return trans;
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
@@ -788,10 +820,50 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
return trans;
}
-/* wait for a transaction commit to be fully complete */
-static noinline void wait_for_commit(struct btrfs_transaction *commit)
+/* Wait for a transaction commit to reach at least the given state. */
+static noinline void wait_for_commit(struct btrfs_transaction *commit,
+ const enum btrfs_trans_state min_state)
{
- wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
+ struct btrfs_fs_info *fs_info = commit->fs_info;
+ u64 transid = commit->transid;
+ bool put = false;
+
+ /*
+ * At the moment this function is called with min_state either being
+ * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
+ */
+ if (min_state == TRANS_STATE_COMPLETED)
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ else
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
+ while (1) {
+ wait_event(commit->commit_wait, commit->state >= min_state);
+ if (put)
+ btrfs_put_transaction(commit);
+
+ if (min_state < TRANS_STATE_COMPLETED)
+ break;
+
+ /*
+ * A transaction isn't really completed until all of the
+ * previous transactions are completed, but with fsync we can
+ * end up with SUPER_COMMITTED transactions before a COMPLETED
+ * transaction. Wait for those.
+ */
+
+ spin_lock(&fs_info->trans_lock);
+ commit = list_first_entry_or_null(&fs_info->trans_list,
+ struct btrfs_transaction,
+ list);
+ if (!commit || commit->transid > transid) {
+ spin_unlock(&fs_info->trans_lock);
+ break;
+ }
+ refcount_inc(&commit->use_count);
+ put = true;
+ spin_unlock(&fs_info->trans_lock);
+ }
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -846,7 +918,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
goto out; /* nothing committing|committed */
}
- wait_for_commit(cur_trans);
+ wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
btrfs_put_transaction(cur_trans);
out:
return ret;
@@ -857,24 +929,23 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info)
wait_current_trans(fs_info);
}
-static int should_end_transaction(struct btrfs_trans_handle *trans)
+static bool should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
if (btrfs_check_space_for_delayed_refs(fs_info))
- return 1;
+ return true;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
}
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
- cur_trans->delayed_refs.flushing)
- return 1;
+ test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
+ return true;
return should_end_transaction(trans);
}
@@ -896,7 +967,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
+ trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
}
@@ -929,6 +1000,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_dec(cur_trans, trans->type);
cond_wake_up(&cur_trans->writer_wait);
+
+ btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(info, btrfs_trans_num_writers);
+
btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
@@ -937,10 +1012,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (trans->aborted ||
- test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
- err = -EIO;
+ if (TRANS_ABORTED(trans))
+ err = trans->aborted;
+ else
+ err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -1039,7 +1116,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* it's safe to do it (through extent_io_tree_release()).
*/
err = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, 0, 0, &cached_state);
+ EXTENT_NEED_WAIT, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err)
@@ -1180,19 +1257,21 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
struct extent_buffer *eb;
int ret;
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
+
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
-
ret = btrfs_run_dev_stats(trans);
if (ret)
return ret;
@@ -1207,10 +1286,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
if (ret)
return ret;
- /* run_qgroups might have added some more refs */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
@@ -1219,21 +1294,29 @@ again:
root = list_entry(next, struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
- if (root != fs_info->extent_root)
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
}
+ /* Now flush any delayed refs generated by updating all of the roots */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret)
+ return ret;
+
while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
ret = btrfs_write_dirty_block_groups(trans);
if (ret)
return ret;
+
+ /*
+ * We're writing the dirty block groups, which could generate
+ * delayed refs, which could generate more dirty block groups,
+ * so we want to keep this flushing in this loop to make sure
+ * everything gets run.
+ */
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
@@ -1242,9 +1325,6 @@ again:
if (!list_empty(&fs_info->dirty_cowonly_roots))
goto again;
- list_add_tail(&fs_info->extent_root->dirty_list,
- &trans->transaction->switch_commits);
-
/* Update dev-replace pointer once everything is committed */
fs_info->dev_replace.committed_cursor_left =
fs_info->dev_replace.cursor_left_last_write_of_item;
@@ -1253,6 +1333,32 @@ again:
}
/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * We put the drop in progress roots at the front of the list, so if the
+ * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+ * up.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root,
+ root_list);
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
@@ -1262,13 +1368,21 @@ void btrfs_add_dead_root(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
- if (list_empty(&root->root_list))
- list_add_tail(&root->root_list, &fs_info->dead_roots);
+ if (list_empty(&root->root_list)) {
+ btrfs_grab_root(root);
+
+ /* We want to process the partially complete drops first. */
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+ list_add(&root->root_list, &fs_info->dead_roots);
+ else
+ list_add_tail(&root->root_list, &fs_info->dead_roots);
+ }
spin_unlock(&fs_info->trans_lock);
}
/*
- * update all the cowonly tree roots on disk
+ * Update each subvolume root and its relocation root, if it exists, in the tree
+ * of tree roots. Also free log roots if they exist.
*/
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
@@ -1276,7 +1390,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *gang[8];
int i;
int ret;
- int err = 0;
+
+ /*
+ * At this point no one can be using this transaction to modify any tree
+ * and no one can start another transaction to modify any tree either.
+ */
+ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
@@ -1288,15 +1407,25 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
break;
for (i = 0; i < ret; i++) {
struct btrfs_root *root = gang[i];
+ int ret2;
+
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
- btrfs_update_reloc_root(trans, root);
-
- btrfs_save_ino_cache(root, trans);
+ ret2 = btrfs_update_reloc_root(trans, root);
+ if (ret2)
+ return ret2;
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1309,17 +1438,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
root->node);
}
- err = btrfs_update_root(trans, fs_info->tree_root,
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ if (ret2)
+ return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
- if (err)
- break;
btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- return err;
+ return 0;
}
/*
@@ -1337,8 +1466,10 @@ int btrfs_defrag_root(struct btrfs_root *root)
while (1) {
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
ret = btrfs_defrag_leaves(trans, root);
@@ -1389,13 +1520,26 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* recorded root will never be updated again, causing an outdated root
* item.
*/
- record_root_in_trans(trans, src, 1);
+ ret = record_root_in_trans(trans, src, 1);
+ if (ret)
+ return ret;
/*
- * We are going to commit transaction, see btrfs_commit_transaction()
- * comment for reason locking tree_log_mutex
+ * btrfs_qgroup_inherit relies on a consistent view of the usage for the
+ * src root, so we must run the delayed refs here.
+ *
+ * However this isn't particularly fool proof, because there's no
+ * synchronization keeping us from changing the tree after this point
+ * before we do the qgroup_inherit, or even from making changes while
+ * we're doing the qgroup_inherit. But that's a problem for the future,
+ * for now flush the delayed refs to narrow the race window where the
+ * qgroup counters could end up wrong.
*/
- mutex_lock(&fs_info->tree_log_mutex);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = commit_fs_roots(trans);
if (ret)
@@ -1432,8 +1576,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
"Error while writing out transaction for qgroup");
out:
- mutex_unlock(&fs_info->tree_log_mutex);
-
/*
* Force parent root to be updated, as we recorded it before so its
* last_trans == cur_transid.
@@ -1441,7 +1583,7 @@ out:
* insert_dir_item()
*/
if (!ret)
- record_root_in_trans(trans, parent, 1);
+ ret = record_root_in_trans(trans, parent, 1);
return ret;
}
@@ -1477,7 +1619,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 index = 0;
u64 objectid;
u64 root_flags;
- uuid_le new_uuid;
ASSERT(pending->path);
path = pending->path;
@@ -1485,7 +1626,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ASSERT(pending->root_item);
new_root_item = pending->root_item;
- pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+ pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
goto no_free_objectid;
@@ -1498,7 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
- pending->error = btrfs_block_rsv_add(root,
+ pending->error = btrfs_block_rsv_add(fs_info,
&pending->block_rsv,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
@@ -1519,8 +1660,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root, 0);
-
+ ret = record_root_in_trans(trans, parent_root, 0);
+ if (ret)
+ goto fail;
cur_time = current_time(parent_inode);
/*
@@ -1556,7 +1698,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- record_root_in_trans(trans, root, 0);
+ ret = record_root_in_trans(trans, root, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -1570,8 +1716,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_generation_v2(new_root_item,
trans->transid);
- uuid_le_gen(&new_uuid);
- memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(new_root_item->uuid);
memcpy(new_root_item->parent_uuid, root->root_item.uuid,
BTRFS_UUID_SIZE);
if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
@@ -1587,7 +1732,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -1595,8 +1741,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking_write(old);
-
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
btrfs_tree_unlock(old);
@@ -1633,9 +1777,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1646,12 +1791,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
/*
* Do special qgroup accounting for snapshot, as we do some qgroup
* snapshot hack to do fast snapshot.
@@ -1675,14 +1814,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
dentry->d_name.len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime =
- current_time(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+ parent_inode->i_mtime = current_time(parent_inode);
+ parent_inode->i_ctime = parent_inode->i_mtime;
+ ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
- ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
+ ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1698,12 +1838,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
fail:
pending->error = ret;
dir_item_existed:
@@ -1756,6 +1890,8 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->root_level = root_item->level;
if (btrfs_test_opt(fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
+ else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
+ super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
}
@@ -1786,74 +1922,14 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
return ret;
}
-/*
- * wait for the current transaction commit to start and block subsequent
- * transaction joins
- */
-static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *trans)
-{
- wait_event(fs_info->transaction_blocked_wait,
- trans->state >= TRANS_STATE_COMMIT_START || trans->aborted);
-}
-
-/*
- * wait for the current transaction to start and then become unblocked.
- * caller holds ref.
- */
-static void wait_current_trans_commit_start_and_unblock(
- struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *trans)
-{
- wait_event(fs_info->transaction_wait,
- trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted);
-}
-
-/*
- * commit transactions asynchronously. once btrfs_commit_transaction_async
- * returns, any subsequent transaction will not be allowed to join.
- */
-struct btrfs_async_commit {
- struct btrfs_trans_handle *newtrans;
- struct work_struct work;
-};
-
-static void do_async_commit(struct work_struct *work)
-{
- struct btrfs_async_commit *ac =
- container_of(work, struct btrfs_async_commit, work);
-
- /*
- * We've got freeze protection passed with the transaction.
- * Tell lockdep about it.
- */
- if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
-
- current->journal_info = ac->newtrans;
-
- btrfs_commit_transaction(ac->newtrans);
- kfree(ac);
-}
-
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- int wait_for_unblock)
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_async_commit *ac;
struct btrfs_transaction *cur_trans;
- ac = kmalloc(sizeof(*ac), GFP_NOFS);
- if (!ac)
- return -ENOMEM;
-
- INIT_WORK(&ac->work, do_async_commit);
- ac->newtrans = btrfs_join_transaction(trans->root);
- if (IS_ERR(ac->newtrans)) {
- int err = PTR_ERR(ac->newtrans);
- kfree(ac);
- return err;
- }
+ /* Kick the transaction kthread. */
+ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+ wake_up_process(fs_info->transaction_kthread);
/* take transaction reference */
cur_trans = trans->transaction;
@@ -1862,28 +1938,16 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
btrfs_end_transaction(trans);
/*
- * Tell lockdep we've released the freeze rwsem, since the
- * async commit thread will be the one to unlock it.
+ * Wait for the current transaction commit to start and block
+ * subsequent transaction joins
*/
- if (ac->newtrans->type & __TRANS_FREEZABLE)
- __sb_writers_release(fs_info->sb, SB_FREEZE_FS);
-
- schedule_work(&ac->work);
-
- /* wait for transaction to start and unblock */
- if (wait_for_unblock)
- wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
- else
- wait_current_trans_commit_start(fs_info, cur_trans);
-
- if (current->journal_info == trans)
- current->journal_info = NULL;
-
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ wait_event(fs_info->transaction_blocked_wait,
+ cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
- return 0;
}
-
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1902,15 +1966,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
*/
BUG_ON(list_empty(&cur_trans->list));
- list_del_init(&cur_trans->list);
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has already released the lockdep map as reader
+ * already in btrfs_commit_transaction().
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
spin_lock(&fs_info->trans_lock);
}
+
+ /*
+ * Now that we know no one else is still using the transaction we can
+ * remove the transaction from the list of transactions. This avoids
+ * the transaction kthread from cleaning up the transaction while some
+ * other task is still using it, which could result in a use-after-free
+ * on things like log trees, as it forces the transaction kthread to
+ * wait for this transaction to be cleaned up by us.
+ */
+ list_del_init(&cur_trans->list);
+
spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info);
@@ -1925,7 +2005,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
- trace_btrfs_transaction_commit(trans->root);
+ trace_btrfs_transaction_commit(fs_info);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1949,63 +2029,64 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
}
}
-static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
/*
- * We use writeback_inodes_sb here because if we used
+ * We use try_to_writeback_inodes_sb() here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
* Currently are holding the fs freeze lock, if we do an async flush
* we'll do btrfs_join_transaction() and deadlock because we need to
* wait for the fs freeze lock. Using the direct flushing we benefit
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
+ *
+ * Note that try_to_writeback_inodes_sb() will only trigger writeback
+ * if it can read lock sb->s_umount. It will always be able to lock it,
+ * except when the filesystem is being unmounted or being frozen, but in
+ * those cases sync_filesystem() is called, which results in calling
+ * writeback_inodes_sb() while holding a write lock on sb->s_umount.
+ * Note that we don't call writeback_inodes_sb() directly, because it
+ * will emit a warning if sb->s_umount is not locked.
*/
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
- } else {
- struct btrfs_pending_snapshot *pending;
- struct list_head *head = &trans->transaction->pending_snapshots;
-
- /*
- * Flush dellaloc for any root that is going to be snapshotted.
- * This is done to avoid a corrupted version of files, in the
- * snapshots, that had both buffered and direct IO writes (even
- * if they were done sequentially) due to an unordered update of
- * the inode's size on disk.
- */
- list_for_each_entry(pending, head, list) {
- int ret;
-
- ret = btrfs_start_delalloc_snapshot(pending->root);
- if (ret)
- return ret;
- }
- }
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
+ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
}
-static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- } else {
- struct btrfs_pending_snapshot *pending;
- struct list_head *head = &trans->transaction->pending_snapshots;
+}
- /*
- * Wait for any dellaloc that we started previously for the roots
- * that are going to be snapshotted. This is to avoid a corrupted
- * version of files in the snapshots that had both buffered and
- * direct IO writes (even if they were done sequentially).
- */
- list_for_each_entry(pending, head, list)
- btrfs_wait_ordered_extents(pending->root,
- U64_MAX, 0, U64_MAX);
- }
+/*
+ * Add a pending snapshot associated with the given transaction handle to the
+ * respective handle. This must be called after the transaction commit started
+ * and while holding fs_info->trans_lock.
+ * This serves to guarantee a caller of btrfs_commit_transaction() that it can
+ * safely free the pending snapshot pointer in case btrfs_commit_transaction()
+ * returns an error.
+ */
+static void add_pending_snapshot(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (!trans->pending_snapshot)
+ return;
+
+ lockdep_assert_held(&trans->fs_info->trans_lock);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+
+ list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
+}
+
+static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+{
+ fs_info->commit_stats.commit_count++;
+ fs_info->commit_stats.last_commit_dur = interval;
+ fs_info->commit_stats.max_commit_dur =
+ max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
+ fs_info->commit_stats.total_commit_dur += interval;
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2014,53 +2095,38 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ktime_t start_time;
+ ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
-
- /*
- * Some places just start a transaction to commit it. We need to make
- * sure that if this commit fails that the abort code actually marks the
- * transaction as failed, so set trans->dirty to make the abort code do
- * the right thing.
- */
- trans->dirty = true;
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
/* Stop the commit early if ->aborted is set */
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- btrfs_end_transaction(trans);
- return ret;
+ goto lockdep_trans_commit_start_release;
}
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- /* make a pass through all the delayed refs we have so far
- * any runnings procs may add more while we are here
- */
- ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
-
- cur_trans = trans->transaction;
-
/*
- * set the flushing flag so procs in this transaction have to
- * start sending their work down.
+ * We only want one transaction commit doing the flushing so we do not
+ * waste a bunch of time on lock contention on the extent root node.
*/
- cur_trans->delayed_refs.flushing = 1;
- smp_wmb();
+ if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
+ &cur_trans->delayed_refs.flags)) {
+ /*
+ * Make a pass through all the delayed refs we have so far.
+ * Any running threads may add more while we are here.
+ */
+ ret = btrfs_run_delayed_refs(trans, 0);
+ if (ret)
+ goto lockdep_trans_commit_start_release;
+ }
btrfs_create_pending_block_groups(trans);
- ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
-
if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
@@ -2085,22 +2151,29 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (run_it) {
ret = btrfs_start_dirty_block_groups(trans);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
}
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
+ add_pending_snapshot(trans);
+
spin_unlock(&fs_info->trans_lock);
refcount_inc(&cur_trans->use_count);
- ret = btrfs_end_transaction(trans);
- wait_for_commit(cur_trans);
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+
+ btrfs_trans_state_lockdep_release(fs_info,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ ret = btrfs_end_transaction(trans);
+ wait_for_commit(cur_trans, want_state);
- if (unlikely(cur_trans->aborted))
+ if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
@@ -2110,20 +2183,27 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
if (cur_trans->list.prev != &fs_info->trans_list) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (prev_trans->state != TRANS_STATE_COMPLETED) {
+ if (prev_trans->state < want_state) {
refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
- wait_for_commit(prev_trans);
- ret = prev_trans->aborted;
+ wait_for_commit(prev_trans, want_state);
+
+ ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
} else {
spin_unlock(&fs_info->trans_lock);
}
@@ -2135,31 +2215,55 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
- if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
- goto cleanup_transaction;
+ goto lockdep_release;
}
}
+ /*
+ * Get the time spent on the work done by the commit thread and not
+ * the time spent waiting on a previous commit
+ */
+ start_time = ktime_get_ns();
+
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(trans);
+ ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
ret = btrfs_run_delayed_items(trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
wait_event(cur_trans->writer_wait,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
ret = btrfs_run_delayed_items(trans);
- if (ret)
+ if (ret) {
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
goto cleanup_transaction;
+ }
- btrfs_wait_delalloc_flush(trans);
+ btrfs_wait_delalloc_flush(fs_info);
+
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
btrfs_scrub_pause(fs_info);
/*
@@ -2168,14 +2272,40 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&fs_info->trans_lock);
+ add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- /* ->aborted might be set after the previous check, so check it */
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ /*
+ * Make lockdep happy by acquiring the state locks after
+ * btrfs_trans_num_writers is released. If we acquired the state locks
+ * before releasing the btrfs_trans_num_writers lock then lockdep would
+ * complain because we did not follow the reverse order unlocking rule.
+ */
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+ /*
+ * We've started the commit, clear the flag in case we were triggered to
+ * do an async commit but somebody else started before the transaction
+ * kthread could do the work.
+ */
+ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
+
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
goto scrub_continue;
}
/*
@@ -2191,10 +2321,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* core function of the snapshot creation.
*/
ret = create_pending_snapshots(trans);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* We insert the dir indexes of the snapshots and update the inode
@@ -2207,16 +2335,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* make sure none of the code above managed to slip in a
@@ -2226,27 +2350,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
WARN_ON(cur_trans != trans->transaction);
- /* btrfs_commit_tree_roots is responsible for getting the
- * various roots consistent with each other. Every pointer
- * in the tree of tree roots has to point to the most up to date
- * root for every subvolume and other tree. So, we have to keep
- * the tree logging code from jumping in and changing any
- * of the trees.
- *
- * At this point in the commit, there can't be any tree-log
- * writers, but a little lower down we drop the trans mutex
- * and let new people in. By holding the tree_log_mutex
- * from now until after the super is written, we avoid races
- * with the tree-log code.
- */
- mutex_lock(&fs_info->tree_log_mutex);
-
ret = commit_fs_roots(trans);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* Since the transaction is done, we can apply the pending changes
@@ -2260,47 +2366,26 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_free_log_root_tree(trans, fs_info);
/*
- * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
- * new delayed refs. Must handle them or qgroup can be wrong.
- */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
-
- /*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
ret = btrfs_qgroup_account_extents(trans);
- if (ret < 0) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret < 0)
+ goto unlock_reloc;
ret = commit_cowonly_roots(trans);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* The tasks which save the space cache and inode cache may also
* update ->aborted, check it.
*/
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
+ goto unlock_reloc;
}
- btrfs_prepare_extent_commit(fs_info);
-
cur_trans = fs_info->running_transaction;
btrfs_set_root_node(&fs_info->tree_root->root_item,
@@ -2313,6 +2398,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_set_root_node(&fs_info->block_group_root->root_item,
+ fs_info->block_group_root->node);
+ list_add_tail(&fs_info->block_group_root->dirty_list,
+ &cur_trans->switch_commits);
+ }
+
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
@@ -2331,6 +2423,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_trans_release_chunk_metadata(trans);
+ /*
+ * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
+ * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
+ * make sure that before we commit our superblock, no other task can
+ * start a new transaction and commit a log tree before we commit our
+ * superblock. Anyone trying to commit a log tree locks this mutex before
+ * writing its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
fs_info->running_transaction = NULL;
@@ -2338,6 +2440,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->reloc_mutex);
wake_up(&fs_info->transaction_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
@@ -2347,6 +2450,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
+ /*
+ * At this point, we should have written all the tree blocks allocated
+ * in this transaction. So it's now safe to free the redirtyied extent
+ * buffers.
+ */
+ btrfs_free_redirty_list(cur_trans);
+
ret = write_all_supers(fs_info, 0);
/*
* the super is written, we can safely allow the tree-loggers
@@ -2356,6 +2466,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+ wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
btrfs_finish_extent_commit(trans);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
@@ -2368,7 +2486,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
- clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
@@ -2380,7 +2498,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(fs_info->sb);
- trace_btrfs_transaction_commit(trans->root);
+ trace_btrfs_transaction_commit(fs_info);
+
+ interval = ktime_get_ns() - start_time;
btrfs_scrub_continue(fs_info);
@@ -2389,9 +2509,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ update_commit_stats(fs_info, interval);
+
return ret;
+unlock_reloc:
+ mutex_unlock(&fs_info->reloc_mutex);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
scrub_continue:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
@@ -2404,6 +2531,16 @@ cleanup_transaction:
cleanup_transaction(trans, ret);
return ret;
+
+lockdep_release:
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ goto cleanup_transaction;
+
+lockdep_trans_commit_start_release:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_end_transaction(trans);
+ return ret;
}
/*
@@ -2416,10 +2553,10 @@ cleanup_transaction:
* because btrfs_commit_super will poke cleaner thread and it will process it a
* few seconds later.
*/
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root;
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
@@ -2437,10 +2574,11 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+ ret = btrfs_drop_snapshot(root, 0, 0);
else
- ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+ ret = btrfs_drop_snapshot(root, 1, 0);
+ btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
}
@@ -2453,16 +2591,6 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
if (!prev)
return;
- bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
- if (prev & bit)
- btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
- prev &= ~bit;
-
- bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
- if (prev & bit)
- btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
- prev &= ~bit;
-
bit = 1 << BTRFS_PENDING_COMMIT;
if (prev & bit)
btrfs_debug(fs_info, "pending commit done");
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 49f7196368f5..970ff316069d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -16,6 +16,7 @@ enum btrfs_trans_state {
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
+ TRANS_STATE_SUPER_COMMITTED,
TRANS_STATE_COMPLETED,
TRANS_STATE_MAX,
};
@@ -71,6 +72,7 @@ struct btrfs_transaction {
*/
struct list_head io_bgs;
struct list_head dropped_roots;
+ struct extent_io_tree pinned_extents;
/*
* we need to make sure block group deletion doesn't race with
@@ -84,6 +86,16 @@ struct btrfs_transaction {
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
+
+ spinlock_t releasing_ebs_lock;
+ struct list_head releasing_ebs;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -103,8 +115,6 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
-#define BTRFS_SEND_TRANS_STUB ((void *)1)
-
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
@@ -113,19 +123,32 @@ struct btrfs_trans_handle {
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ /* Set by a task that wants to create a snapshot. */
+ struct btrfs_pending_snapshot *pending_snapshot;
refcount_t use_count;
unsigned int type;
+ /*
+ * Error code of transaction abort, set outside of locks and must use
+ * the READ_ONCE/WRITE_ONCE access
+ */
short aborted;
bool adding_csums;
bool allocating_chunk;
- bool can_flush_pending_bgs;
+ bool removing_chunk;
bool reloc_reserved;
- bool dirty;
- struct btrfs_root *root;
+ bool in_fsync;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
};
+/*
+ * The abort status can be changed between calls and is not protected by locks.
+ * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
+ * set to a non-zero value it does not change, so the macro should be in checks
+ * but is not necessary for further reads of the value.
+ */
+#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted)))
+
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
@@ -138,18 +161,20 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */
int error;
+ /* Preallocated anonymous block device number */
+ dev_t anon_dev;
bool readonly;
struct list_head list;
};
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_trans = trans->transaction->transid;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_trans = trans->transaction->transid;
+ inode->last_sub_trans = inode->root->log_transid;
+ inode->last_log_commit = inode->last_sub_trans - 1;
+ spin_unlock(&inode->lock);
}
/*
@@ -180,8 +205,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
unsigned int num_items);
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
- unsigned int num_items,
- int min_factor);
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
@@ -192,26 +216,12 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- int wait_for_unblock);
-
-/*
- * Try to commit transaction asynchronously, so this is safe to call
- * even holding a spinlock.
- *
- * It's done by informing transaction_kthread to commit transaction without
- * waiting for commit interval.
- */
-static inline void btrfs_commit_transaction_locksafe(
- struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
- wake_up_process(fs_info->transaction_kthread);
-}
+void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a92f8a6dd192..43f905ab0a18 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -24,6 +24,7 @@
#include "compression.h"
#include "volumes.h"
#include "misc.h"
+#include "btrfs_inode.h"
/*
* Error message should follow the following format:
@@ -100,7 +101,8 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
*/
#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \
({ \
- if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
+ if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)))) \
file_extent_err((leaf), (slot), \
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)), \
@@ -200,10 +202,10 @@ static int check_extent_data_item(struct extent_buffer *leaf,
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_file_extent_item *fi;
u32 sectorsize = fs_info->sectorsize;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u64 extent_end;
- if (!IS_ALIGNED(key->offset, sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
file_extent_err(leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
key->offset, sectorsize);
@@ -216,7 +218,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* But if objectids mismatch, it means we have a missing
* INODE_ITEM.
*/
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
@@ -225,14 +227,15 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Make sure the item contains at least inline header, so the file
* extent type is not some garbage.
*/
- if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
+ if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) {
file_extent_err(leaf, slot,
"invalid item size, have %u expect [%zu, %u)",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
SZ_4K);
return -EUCLEAN;
}
- if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) >=
+ BTRFS_NR_FILE_EXTENT_TYPES)) {
file_extent_err(leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
@@ -244,14 +247,15 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
- if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) {
+ if (unlikely(btrfs_file_extent_compression(leaf, fi) >=
+ BTRFS_NR_COMPRESS_TYPES)) {
file_extent_err(leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
BTRFS_NR_COMPRESS_TYPES - 1);
return -EUCLEAN;
}
- if (btrfs_file_extent_encryption(leaf, fi)) {
+ if (unlikely(btrfs_file_extent_encryption(leaf, fi))) {
file_extent_err(leaf, slot,
"invalid encryption for file extent, have %u expect 0",
btrfs_file_extent_encryption(leaf, fi));
@@ -259,7 +263,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
/* Inline extent must have 0 as key offset */
- if (key->offset) {
+ if (unlikely(key->offset)) {
file_extent_err(leaf, slot,
"invalid file_offset for inline file extent, have %llu expect 0",
key->offset);
@@ -272,8 +276,8 @@ static int check_extent_data_item(struct extent_buffer *leaf,
return 0;
/* Uncompressed inline extent size must match item size */
- if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
- btrfs_file_extent_ram_bytes(leaf, fi)) {
+ if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi))) {
file_extent_err(leaf, slot,
"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
@@ -284,22 +288,22 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
/* Regular or preallocated extent has fixed item size */
- if (item_size != sizeof(*fi)) {
+ if (unlikely(item_size != sizeof(*fi))) {
file_extent_err(leaf, slot,
"invalid item size for reg/prealloc file extent, have %u expect %zu",
item_size, sizeof(*fi));
return -EUCLEAN;
}
- if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
+ if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)))
return -EUCLEAN;
/* Catch extent end overflow */
- if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
- key->offset, &extent_end)) {
+ if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
+ key->offset, &extent_end))) {
file_extent_err(leaf, slot,
"extent end overflow, have file offset %llu extent num bytes %llu",
key->offset,
@@ -320,7 +324,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
prev_fi = btrfs_item_ptr(leaf, slot - 1,
struct btrfs_file_extent_item);
prev_end = file_extent_end(leaf, prev_key, prev_fi);
- if (prev_end > key->offset) {
+ if (unlikely(prev_end > key->offset)) {
file_extent_err(leaf, slot - 1,
"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
prev_end, key->offset);
@@ -336,34 +340,34 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u32 sectorsize = fs_info->sectorsize;
- u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csumsize = fs_info->csum_size;
- if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) {
generic_err(leaf, slot,
"invalid key objectid for csum item, have %llu expect %llu",
key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->offset, sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
generic_err(leaf, slot,
"unaligned key offset for csum item, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
- if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ if (unlikely(!IS_ALIGNED(btrfs_item_size(leaf, slot), csumsize))) {
generic_err(leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
- btrfs_item_size_nr(leaf, slot), csumsize);
+ btrfs_item_size(leaf, slot), csumsize);
return -EUCLEAN;
}
if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
u64 prev_csum_end;
u32 prev_item_size;
- prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+ prev_item_size = btrfs_item_size(leaf, slot - 1);
prev_csum_end = (prev_item_size / csumsize) * sectorsize;
prev_csum_end += prev_key->offset;
- if (prev_csum_end > key->offset) {
+ if (unlikely(prev_csum_end > key->offset)) {
generic_err(leaf, slot - 1,
"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
prev_csum_end, key->offset);
@@ -388,15 +392,16 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
/* For XATTR_ITEM, location key should be all 0 */
if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (key->type != 0 || key->objectid != 0 || key->offset != 0)
+ if (unlikely(key->objectid != 0 || key->type != 0 ||
+ key->offset != 0))
return -EUCLEAN;
return 0;
}
- if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
- key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
- key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
- key->objectid != BTRFS_FREE_INO_OBJECTID) {
+ if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID)) {
if (is_inode_item) {
generic_err(leaf, slot,
"invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
@@ -414,7 +419,7 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
return -EUCLEAN;
}
- if (key->offset != 0) {
+ if (unlikely(key->offset != 0)) {
if (is_inode_item)
inode_item_err(leaf, slot,
"invalid key offset: has %llu expect 0",
@@ -438,7 +443,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
/* No such tree id */
- if (key->objectid == 0) {
+ if (unlikely(key->objectid == 0)) {
if (is_root_item)
generic_err(leaf, slot, "invalid root id 0");
else
@@ -448,7 +453,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (!is_fstree(key->objectid) && !is_root_item) {
+ if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -464,7 +469,8 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* So here we only check offset for reloc tree whose key->offset must
* be a valid tree.
*/
- if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+ if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ key->offset == 0)) {
generic_err(leaf, slot, "invalid root id 0 for reloc tree");
return -EUCLEAN;
}
@@ -477,11 +483,12 @@ static int check_dir_item(struct extent_buffer *leaf,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dir_item *di;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
u32 cur = 0;
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
+
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
struct btrfs_key location_key;
@@ -494,7 +501,7 @@ static int check_dir_item(struct extent_buffer *leaf,
int ret;
/* header itself should not cross item boundary */
- if (cur + sizeof(*di) > item_size) {
+ if (unlikely(cur + sizeof(*di) > item_size)) {
dir_item_err(leaf, slot,
"dir item header crosses item boundary, have %zu boundary %u",
cur + sizeof(*di), item_size);
@@ -505,12 +512,12 @@ static int check_dir_item(struct extent_buffer *leaf,
btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
ret = check_root_key(leaf, &location_key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
} else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
location_key.type == 0) {
ret = check_inode_key(leaf, &location_key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
} else {
dir_item_err(leaf, slot,
@@ -522,22 +529,22 @@ static int check_dir_item(struct extent_buffer *leaf,
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
- if (dir_type >= BTRFS_FT_MAX) {
+ if (unlikely(dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
}
- if (key->type == BTRFS_XATTR_ITEM_KEY &&
- dir_type != BTRFS_FT_XATTR) {
+ if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR)) {
dir_item_err(leaf, slot,
"invalid dir item type for XATTR key, have %u expect %u",
dir_type, BTRFS_FT_XATTR);
return -EUCLEAN;
}
- if (dir_type == BTRFS_FT_XATTR &&
- key->type != BTRFS_XATTR_ITEM_KEY) {
+ if (unlikely(dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY)) {
dir_item_err(leaf, slot,
"xattr dir type found for non-XATTR key");
return -EUCLEAN;
@@ -550,13 +557,13 @@ static int check_dir_item(struct extent_buffer *leaf,
/* Name/data length check */
name_len = btrfs_dir_name_len(leaf, di);
data_len = btrfs_dir_data_len(leaf, di);
- if (name_len > max_name_len) {
+ if (unlikely(name_len > max_name_len)) {
dir_item_err(leaf, slot,
"dir item name len too long, have %u max %u",
name_len, max_name_len);
return -EUCLEAN;
}
- if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) {
dir_item_err(leaf, slot,
"dir item name and data len too long, have %u max %u",
name_len + data_len,
@@ -564,7 +571,7 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (data_len && dir_type != BTRFS_FT_XATTR) {
+ if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) {
dir_item_err(leaf, slot,
"dir item with invalid data len, have %u expect 0",
data_len);
@@ -574,7 +581,7 @@ static int check_dir_item(struct extent_buffer *leaf,
total_size = sizeof(*di) + name_len + data_len;
/* header and name/data should not cross item boundary */
- if (cur + total_size > item_size) {
+ if (unlikely(cur + total_size > item_size)) {
dir_item_err(leaf, slot,
"dir item data crosses item boundary, have %u boundary %u",
cur + total_size, item_size);
@@ -592,7 +599,7 @@ static int check_dir_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, namebuf,
(unsigned long)(di + 1), name_len);
name_hash = btrfs_name_hash(namebuf, name_len);
- if (key->offset != name_hash) {
+ if (unlikely(key->offset != name_hash)) {
dir_item_err(leaf, slot,
"name hash mismatch with key, have 0x%016x expect 0x%016llx",
name_hash, key->offset);
@@ -632,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
static int check_block_group_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_block_group_item bgi;
- u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 item_size = btrfs_item_size(leaf, slot);
+ u64 chunk_objectid;
u64 flags;
u64 type;
@@ -641,13 +650,13 @@ static int check_block_group_item(struct extent_buffer *leaf,
* Here we don't really care about alignment since extent allocator can
* handle it. We care more about the size.
*/
- if (key->offset == 0) {
+ if (unlikely(key->offset == 0)) {
block_group_err(leaf, slot,
"invalid block group size 0");
return -EUCLEAN;
}
- if (item_size != sizeof(bgi)) {
+ if (unlikely(item_size != sizeof(bgi))) {
block_group_err(leaf, slot,
"invalid item size, have %u expect %zu",
item_size, sizeof(bgi));
@@ -656,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (btrfs_stack_block_group_chunk_objectid(&bgi) !=
- BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
+ chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ /*
+ * We don't init the nr_global_roots until we load the global
+ * roots, so this could be 0 at mount time. If it's 0 we'll
+ * just assume we're fine, and later we'll check against our
+ * actual value.
+ */
+ if (unlikely(fs_info->nr_global_roots &&
+ chunk_objectid >= fs_info->nr_global_roots)) {
+ block_group_err(leaf, slot,
+ "invalid block group global root id, have %llu, needs to be <= %llu",
+ chunk_objectid,
+ fs_info->nr_global_roots);
+ return -EUCLEAN;
+ }
+ } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
btrfs_stack_block_group_chunk_objectid(&bgi),
@@ -665,7 +689,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (btrfs_stack_block_group_used(&bgi) > key->offset) {
+ if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) {
block_group_err(leaf, slot,
"invalid block group used, have %llu expect [0, %llu)",
btrfs_stack_block_group_used(&bgi), key->offset);
@@ -673,7 +697,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
}
flags = btrfs_stack_block_group_flags(&bgi);
- if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
+ if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) {
block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
@@ -682,11 +706,11 @@ static int check_block_group_item(struct extent_buffer *leaf,
}
type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (type != BTRFS_BLOCK_GROUP_DATA &&
- type != BTRFS_BLOCK_GROUP_METADATA &&
- type != BTRFS_BLOCK_GROUP_SYSTEM &&
- type != (BTRFS_BLOCK_GROUP_METADATA |
- BTRFS_BLOCK_GROUP_DATA)) {
+ if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
+ type != BTRFS_BLOCK_GROUP_METADATA &&
+ type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA))) {
block_group_err(leaf, slot,
"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
@@ -754,50 +778,75 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u64 length;
+ u64 chunk_end;
u64 stripe_len;
u16 num_stripes;
u16 sub_stripes;
u64 type;
u64 features;
bool mixed = false;
+ int raid_index;
+ int nparity;
+ int ncopies;
length = btrfs_chunk_length(leaf, chunk);
stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
+ raid_index = btrfs_bg_flags_to_raid_index(type);
+ ncopies = btrfs_raid_array[raid_index].ncopies;
+ nparity = btrfs_raid_array[raid_index].nparity;
- if (!num_stripes) {
+ if (unlikely(!num_stripes)) {
chunk_err(leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
- if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+ if (unlikely(num_stripes < ncopies)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes < ncopies, have %u < %d",
+ num_stripes, ncopies);
+ return -EUCLEAN;
+ }
+ if (unlikely(nparity && num_stripes == nparity)) {
+ chunk_err(leaf, chunk, logical,
+ "invalid chunk num_stripes == nparity, have %u == %d",
+ num_stripes, nparity);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) {
chunk_err(leaf, chunk, logical,
"invalid chunk logical, have %llu should aligned to %u",
logical, fs_info->sectorsize);
return -EUCLEAN;
}
- if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+ if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) {
chunk_err(leaf, chunk, logical,
"invalid chunk sectorsize, have %u expect %u",
btrfs_chunk_sector_size(leaf, chunk),
fs_info->sectorsize);
return -EUCLEAN;
}
- if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+ if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) {
chunk_err(leaf, chunk, logical,
"invalid chunk length, have %llu", length);
return -EUCLEAN;
}
- if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+ if (unlikely(check_add_overflow(logical, length, &chunk_end))) {
+ chunk_err(leaf, chunk, logical,
+"invalid chunk logical start and length, have logical start %llu length %llu",
+ logical, length);
+ return -EUCLEAN;
+ }
+ if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
chunk_err(leaf, chunk, logical,
"invalid chunk stripe length: %llu",
stripe_len);
return -EUCLEAN;
}
- if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- type) {
+ if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
chunk_err(leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
@@ -806,22 +855,23 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
- (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
+ if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) {
chunk_err(leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
return -EUCLEAN;
}
- if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
chunk_err(leaf, chunk, logical,
"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
type, BTRFS_BLOCK_GROUP_TYPE_MASK);
return -EUCLEAN;
}
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
- (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA)))) {
chunk_err(leaf, chunk, logical,
"system chunk with data or metadata type: 0x%llx",
type);
@@ -833,20 +883,30 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
mixed = true;
if (!mixed) {
- if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
- (type & BTRFS_BLOCK_GROUP_DATA)) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA))) {
chunk_err(leaf, chunk, logical,
"mixed chunk type in non-mixed mode: 0x%llx", type);
return -EUCLEAN;
}
}
- if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
+ sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_DUP &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
chunk_err(leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -869,10 +929,10 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
{
int num_stripes;
- if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) {
+ if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(struct btrfs_chunk),
BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
return -EUCLEAN;
@@ -883,11 +943,11 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
if (num_stripes == 0)
goto out;
- if (btrfs_chunk_item_size(num_stripes) !=
- btrfs_item_size_nr(leaf, slot)) {
+ if (unlikely(btrfs_chunk_item_size(num_stripes) !=
+ btrfs_item_size(leaf, slot))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
btrfs_chunk_item_size(num_stripes));
return -EUCLEAN;
}
@@ -922,15 +982,23 @@ static int check_dev_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_dev_item *ditem;
+ const u32 item_size = btrfs_item_size(leaf, slot);
- if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
+ if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
dev_item_err(leaf, slot,
"invalid objectid: has=%llu expect=%llu",
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
+
+ if (unlikely(item_size != sizeof(*ditem))) {
+ dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*ditem));
+ return -EUCLEAN;
+ }
+
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
- if (btrfs_device_id(leaf, ditem) != key->offset) {
+ if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
dev_item_err(leaf, slot,
"devid mismatch: key has=%llu item has=%llu",
key->offset, btrfs_device_id(leaf, ditem));
@@ -942,8 +1010,8 @@ static int check_dev_item(struct extent_buffer *leaf,
* it can be 0 for device removal. Device size check can only be done
* by dev extents check.
*/
- if (btrfs_device_bytes_used(leaf, ditem) >
- btrfs_device_total_bytes(leaf, ditem)) {
+ if (unlikely(btrfs_device_bytes_used(leaf, ditem) >
+ btrfs_device_total_bytes(leaf, ditem))) {
dev_item_err(leaf, slot,
"invalid bytes used: have %llu expect [0, %llu]",
btrfs_device_bytes_used(leaf, ditem),
@@ -957,10 +1025,6 @@ static int check_dev_item(struct extent_buffer *leaf,
return 0;
}
-/* Inode item error output has the same format as dir_item_err() */
-#define inode_item_err(eb, slot, fmt, ...) \
- dir_item_err(eb, slot, fmt, __VA_ARGS__)
-
static int check_inode_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
@@ -968,17 +1032,26 @@ static int check_inode_item(struct extent_buffer *leaf,
struct btrfs_inode_item *iitem;
u64 super_gen = btrfs_super_generation(fs_info->super_copy);
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+ const u32 item_size = btrfs_item_size(leaf, slot);
u32 mode;
int ret;
+ u32 flags;
+ u32 ro_flags;
ret = check_inode_key(leaf, key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
+ if (unlikely(item_size != sizeof(*iitem))) {
+ generic_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*iitem));
+ return -EUCLEAN;
+ }
+
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
- if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
+ if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) {
inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect (0, %llu]",
btrfs_inode_generation(leaf, iitem),
@@ -986,9 +1059,9 @@ static int check_inode_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
- if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
+ if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) {
inode_item_err(leaf, slot,
- "invalid inode generation: has %llu expect [0, %llu]",
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
}
@@ -999,7 +1072,7 @@ static int check_inode_item(struct extent_buffer *leaf,
* anything in the fs. So here we skip the check.
*/
mode = btrfs_inode_mode(leaf, iitem);
- if (mode & ~valid_mask) {
+ if (unlikely(mode & ~valid_mask)) {
inode_item_err(leaf, slot,
"unknown mode bit detected: 0x%x",
mode & ~valid_mask);
@@ -1012,24 +1085,30 @@ static int check_inode_item(struct extent_buffer *leaf,
* FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
*/
if (!has_single_bit_set(mode & S_IFMT)) {
- if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
+ if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) {
inode_item_err(leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
mode & S_IFMT);
return -EUCLEAN;
}
}
- if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
+ if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) {
inode_item_err(leaf, slot,
"invalid nlink: has %u expect no more than 1 for dir",
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
- if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
+ if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
inode_item_err(leaf, slot,
- "unknown flags detected: 0x%llx",
- btrfs_inode_flags(leaf, iitem) &
- ~BTRFS_INODE_FLAG_MASK);
+ "unknown incompat flags detected: 0x%x", flags);
+ return -EUCLEAN;
+ }
+ if (unlikely(!sb_rdonly(fs_info->sb) &&
+ (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
+ inode_item_err(leaf, slot,
+ "unknown ro-compat flags detected on writeable mount: 0x%x",
+ ro_flags);
return -EUCLEAN;
}
return 0;
@@ -1039,43 +1118,52 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_root_item ri;
+ struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
ret = check_root_key(leaf, key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size(leaf, slot) !=
+ btrfs_legacy_root_item_size())) {
generic_err(leaf, slot,
- "invalid root item size, have %u expect %zu",
- btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
+ return -EUCLEAN;
}
+ /*
+ * For legacy root item, the members starting at generation_v2 will be
+ * all filled with 0.
+ * And since we allow geneartion_v2 as 0, it will still pass the check.
+ */
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- sizeof(ri));
+ btrfs_item_size(leaf, slot));
/* Generation related */
- if (btrfs_root_generation(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_generation(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root generation, have %llu expect (0, %llu]",
btrfs_root_generation(&ri),
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (btrfs_root_generation_v2(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_generation_v2(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root v2 generation, have %llu expect (0, %llu]",
btrfs_root_generation_v2(&ri),
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (btrfs_root_last_snapshot(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_last_snapshot(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root last_snapshot, have %llu expect (0, %llu]",
btrfs_root_last_snapshot(&ri),
@@ -1084,27 +1172,27 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* Alignment and level check */
- if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid root bytenr, have %llu expect to be aligned to %u",
btrfs_root_bytenr(&ri), fs_info->sectorsize);
return -EUCLEAN;
}
- if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) {
generic_err(leaf, slot,
"invalid root level, have %u expect [0, %u]",
btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
- if (ri.drop_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) {
generic_err(leaf, slot,
"invalid root level, have %u expect [0, %u]",
- ri.drop_level, BTRFS_MAX_LEVEL - 1);
+ btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
/* Flags check */
- if (btrfs_root_flags(&ri) & ~valid_root_flags) {
+ if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) {
generic_err(leaf, slot,
"invalid root flags, have 0x%llx expect mask 0x%llx",
btrfs_root_flags(&ri), valid_root_flags);
@@ -1145,27 +1233,28 @@ static void extent_err(const struct extent_buffer *eb, int slot,
}
static int check_extent_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
bool is_tree_block = false;
unsigned long ptr; /* Current pointer inside inline refs */
unsigned long end; /* Extent item end */
- const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 item_size = btrfs_item_size(leaf, slot);
u64 flags;
u64 generation;
u64 total_refs; /* Total refs in btrfs_extent_item */
u64 inline_refs = 0; /* found total inline refs */
- if (key->type == BTRFS_METADATA_ITEM_KEY &&
- !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) {
generic_err(leaf, slot,
"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
return -EUCLEAN;
}
/* key->objectid is the bytenr for both key types */
- if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid, have %llu expect to be aligned to %u",
key->objectid, fs_info->sectorsize);
@@ -1173,8 +1262,8 @@ static int check_extent_item(struct extent_buffer *leaf,
}
/* key->offset is tree level for METADATA_ITEM_KEY */
- if (key->type == BTRFS_METADATA_ITEM_KEY &&
- key->offset >= BTRFS_MAX_LEVEL) {
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ key->offset >= BTRFS_MAX_LEVEL)) {
extent_err(leaf, slot,
"invalid tree level, have %llu expect [0, %u]",
key->offset, BTRFS_MAX_LEVEL - 1);
@@ -1200,7 +1289,7 @@ static int check_extent_item(struct extent_buffer *leaf,
* Either using btrfs_extent_inline_ref::offset, or specific
* data structure.
*/
- if (item_size < sizeof(*ei)) {
+ if (unlikely(item_size < sizeof(*ei))) {
extent_err(leaf, slot,
"invalid item size, have %u expect [%zu, %u)",
item_size, sizeof(*ei),
@@ -1214,15 +1303,16 @@ static int check_extent_item(struct extent_buffer *leaf,
flags = btrfs_extent_flags(leaf, ei);
total_refs = btrfs_extent_refs(leaf, ei);
generation = btrfs_extent_generation(leaf, ei);
- if (generation > btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(generation >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
extent_err(leaf, slot,
"invalid generation, have %llu expect (0, %llu]",
generation,
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
- BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+ if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK)))) {
extent_err(leaf, slot,
"invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
flags, BTRFS_EXTENT_FLAG_DATA |
@@ -1231,26 +1321,31 @@ static int check_extent_item(struct extent_buffer *leaf,
}
is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
if (is_tree_block) {
- if (key->type == BTRFS_EXTENT_ITEM_KEY &&
- key->offset != fs_info->nodesize) {
+ if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY &&
+ key->offset != fs_info->nodesize)) {
extent_err(leaf, slot,
"invalid extent length, have %llu expect %u",
key->offset, fs_info->nodesize);
return -EUCLEAN;
}
} else {
- if (key->type != BTRFS_EXTENT_ITEM_KEY) {
+ if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) {
extent_err(leaf, slot,
"invalid key type, have %u expect %u for data backref",
key->type, BTRFS_EXTENT_ITEM_KEY);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent length, have %llu expect aligned to %u",
key->offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ extent_err(leaf, slot,
+ "invalid extent flag, data has full backref set");
+ return -EUCLEAN;
+ }
}
ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
@@ -1259,7 +1354,7 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)ptr;
- if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) {
extent_err(leaf, slot,
"invalid tree block info level, have %u expect [0, %u]",
btrfs_tree_block_level(leaf, info),
@@ -1278,7 +1373,7 @@ static int check_extent_item(struct extent_buffer *leaf,
u64 inline_offset;
u8 inline_type;
- if (ptr + sizeof(*iref) > end) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
ptr, sizeof(*iref), end);
@@ -1287,7 +1382,7 @@ static int check_extent_item(struct extent_buffer *leaf,
iref = (struct btrfs_extent_inline_ref *)ptr;
inline_type = btrfs_extent_inline_ref_type(leaf, iref);
inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
- if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
+ if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
ptr, inline_type, end);
@@ -1301,7 +1396,8 @@ static int check_extent_item(struct extent_buffer *leaf,
break;
/* Contains parent bytenr */
case BTRFS_SHARED_BLOCK_REF_KEY:
- if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid tree parent bytenr, have %llu expect aligned to %u",
inline_offset, fs_info->sectorsize);
@@ -1316,7 +1412,8 @@ static int check_extent_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(dref_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid data ref offset, have %llu expect aligned to %u",
dref_offset, fs_info->sectorsize);
@@ -1327,7 +1424,8 @@ static int check_extent_item(struct extent_buffer *leaf,
/* Contains parent bytenr and ref count */
case BTRFS_SHARED_DATA_REF_KEY:
sref = (struct btrfs_shared_data_ref *)(iref + 1);
- if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid data parent bytenr, have %llu expect aligned to %u",
inline_offset, fs_info->sectorsize);
@@ -1343,19 +1441,39 @@ static int check_extent_item(struct extent_buffer *leaf,
ptr += btrfs_extent_inline_ref_size(inline_type);
}
/* No padding is allowed */
- if (ptr != end) {
+ if (unlikely(ptr != end)) {
extent_err(leaf, slot,
"invalid extent item size, padding bytes found");
return -EUCLEAN;
}
/* Finally, check the inline refs against total refs */
- if (inline_refs > total_refs) {
+ if (unlikely(inline_refs > total_refs)) {
extent_err(leaf, slot,
"invalid extent refs, have %llu expect >= inline %llu",
total_refs, inline_refs);
return -EUCLEAN;
}
+
+ if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
+ (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
+ u64 prev_end = prev_key->objectid;
+
+ if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
+ prev_end += fs_info->nodesize;
+ else
+ prev_end += prev_key->offset;
+
+ if (unlikely(prev_end > key->objectid)) {
+ extent_err(leaf, slot,
+ "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
+ prev_key->objectid, prev_key->type,
+ prev_key->offset, key->objectid, key->type,
+ key->offset);
+ return -EUCLEAN;
+ }
+ }
+
return 0;
}
@@ -1367,21 +1485,21 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
if (key->type == BTRFS_SHARED_DATA_REF_KEY)
expect_item_size = sizeof(struct btrfs_shared_data_ref);
- if (btrfs_item_size_nr(leaf, slot) != expect_item_size) {
+ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
"invalid item size, have %u expect %u for key type %u",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
expect_item_size, key->type);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
key->objectid, leaf->fs_info->sectorsize);
return -EUCLEAN;
}
- if (key->type != BTRFS_TREE_BLOCK_REF_KEY &&
- !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) {
+ if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY &&
+ !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid tree parent bytenr, have %llu expect aligned to %u",
key->offset, leaf->fs_info->sectorsize);
@@ -1395,41 +1513,35 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
{
struct btrfs_extent_data_ref *dref;
unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
- const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
+ const unsigned long end = ptr + btrfs_item_size(leaf, slot);
- if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) {
+ if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
generic_err(leaf, slot,
"invalid item size, have %u expect aligned to %zu for key type %u",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(*dref), key->type);
+ return -EUCLEAN;
}
- if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
key->objectid, leaf->fs_info->sectorsize);
return -EUCLEAN;
}
for (; ptr < end; ptr += sizeof(*dref)) {
- u64 root_objectid;
- u64 owner;
u64 offset;
- u64 hash;
+ /*
+ * We cannot check the extent_data_ref hash due to possible
+ * overflow from the leaf due to hash collisions.
+ */
dref = (struct btrfs_extent_data_ref *)ptr;
- root_objectid = btrfs_extent_data_ref_root(leaf, dref);
- owner = btrfs_extent_data_ref_objectid(leaf, dref);
offset = btrfs_extent_data_ref_offset(leaf, dref);
- hash = hash_extent_data_ref(root_objectid, owner, offset);
- if (hash != key->offset) {
- extent_err(leaf, slot,
- "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
- hash, key->offset);
- return -EUCLEAN;
- }
- if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
}
}
return 0;
@@ -1445,23 +1557,23 @@ static int check_inode_ref(struct extent_buffer *leaf,
unsigned long ptr;
unsigned long end;
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
- if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
+ if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
- btrfs_item_size_nr(leaf, slot),
+ btrfs_item_size(leaf, slot),
sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
return -EUCLEAN;
}
ptr = btrfs_item_ptr_offset(leaf, slot);
- end = ptr + btrfs_item_size_nr(leaf, slot);
+ end = ptr + btrfs_item_size(leaf, slot);
while (ptr < end) {
u16 namelen;
- if (ptr + sizeof(iref) > end) {
+ if (unlikely(ptr + sizeof(iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
ptr, end, sizeof(iref));
@@ -1470,7 +1582,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
iref = (struct btrfs_inode_ref *)ptr;
namelen = btrfs_inode_ref_name_len(leaf, iref);
- if (ptr + sizeof(*iref) + namelen > end) {
+ if (unlikely(ptr + sizeof(*iref) + namelen > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu namelen %u",
ptr, end, namelen);
@@ -1530,7 +1642,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_EXTENT_ITEM_KEY:
case BTRFS_METADATA_ITEM_KEY:
- ret = check_extent_item(leaf, key, slot);
+ ret = check_extent_item(leaf, key, slot, prev_key);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_DATA_REF_KEY:
@@ -1553,7 +1665,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
u32 nritems = btrfs_header_nritems(leaf);
int slot;
- if (btrfs_header_level(leaf) != 0) {
+ if (unlikely(btrfs_header_level(leaf) != 0)) {
generic_err(leaf, 0,
"invalid level for leaf, have %d expect 0",
btrfs_header_level(leaf));
@@ -1572,27 +1684,39 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
u64 owner = btrfs_header_owner(leaf);
/* These trees must never be empty */
- if (owner == BTRFS_ROOT_TREE_OBJECTID ||
- owner == BTRFS_CHUNK_TREE_OBJECTID ||
- owner == BTRFS_EXTENT_TREE_OBJECTID ||
- owner == BTRFS_DEV_TREE_OBJECTID ||
- owner == BTRFS_FS_TREE_OBJECTID ||
- owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
generic_err(leaf, 0,
"invalid root, root %llu must never be empty",
owner);
return -EUCLEAN;
}
+
/* Unknown tree */
- if (owner == 0) {
+ if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
return -EUCLEAN;
}
+
+ /* EXTENT_TREE_V2 can have empty extent trees. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
+ generic_err(leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+
return 0;
}
- if (nritems == 0)
+ if (unlikely(nritems == 0))
return 0;
/*
@@ -1608,12 +1732,13 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
*/
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
+ u64 item_data_end;
int ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
/* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
generic_err(leaf, slot,
"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
prev_key.objectid, prev_key.type,
@@ -1622,6 +1747,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return -EUCLEAN;
}
+ item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+ btrfs_item_size(leaf, slot);
/*
* Make sure the offset and ends are right, remember that the
* item data starts at the end of the leaf and grows towards the
@@ -1630,13 +1757,12 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
if (slot == 0)
item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
else
- item_end_expected = btrfs_item_offset_nr(leaf,
+ item_end_expected = btrfs_item_offset(leaf,
slot - 1);
- if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ if (unlikely(item_data_end != item_end_expected)) {
generic_err(leaf, slot,
- "unexpected item end, have %u expect %u",
- btrfs_item_end_nr(leaf, slot),
- item_end_expected);
+ "unexpected item end, have %llu expect %u",
+ item_data_end, item_end_expected);
return -EUCLEAN;
}
@@ -1645,18 +1771,16 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
- "slot end outside of leaf, have %u expect range [0, %u]",
- btrfs_item_end_nr(leaf, slot),
- BTRFS_LEAF_DATA_SIZE(fs_info));
+ "slot end outside of leaf, have %llu expect range [0, %u]",
+ item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
/* Also check if the item pointer overlaps with btrfs item. */
- if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
- btrfs_item_ptr_offset(leaf, slot)) {
+ if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
+ btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
@@ -1671,7 +1795,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* criteria
*/
ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
}
@@ -1704,13 +1828,13 @@ int btrfs_check_node(struct extent_buffer *node)
u64 bytenr;
int ret = 0;
- if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
level, BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
+ if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
btrfs_header_owner(node), node->start,
@@ -1724,13 +1848,13 @@ int btrfs_check_node(struct extent_buffer *node)
btrfs_node_key_to_cpu(node, &key, slot);
btrfs_node_key_to_cpu(node, &next_key, slot + 1);
- if (!bytenr) {
+ if (unlikely(!bytenr)) {
generic_err(node, slot,
"invalid NULL node pointer");
ret = -EUCLEAN;
goto out;
}
- if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) {
generic_err(node, slot,
"unaligned pointer, have %llu should be aligned to %u",
bytenr, fs_info->sectorsize);
@@ -1738,7 +1862,7 @@ int btrfs_check_node(struct extent_buffer *node)
goto out;
}
- if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
generic_err(node, slot,
"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
key.objectid, key.type, key.offset,
@@ -1752,3 +1876,58 @@ out:
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
+
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
+{
+ const bool is_subvol = is_fstree(root_owner);
+ const u64 eb_owner = btrfs_header_owner(eb);
+
+ /*
+ * Skip dummy fs, as selftests don't create unique ebs for each dummy
+ * root.
+ */
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ return 0;
+ /*
+ * There are several call sites (backref walking, qgroup, and data
+ * reloc) passing 0 as @root_owner, as they are not holding the
+ * tree root. In that case, we can not do a reliable ownership check,
+ * so just exit.
+ */
+ if (root_owner == 0)
+ return 0;
+ /*
+ * These trees use key.offset as their owner, our callers don't have
+ * the extra capacity to pass key.offset here. So we just skip them.
+ */
+ if (root_owner == BTRFS_TREE_LOG_OBJECTID ||
+ root_owner == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!is_subvol) {
+ /* For non-subvolume trees, the eb owner should match root owner */
+ if (unlikely(root_owner != eb_owner)) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ root_owner);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /*
+ * For subvolume trees, owners can mismatch, but they should all belong
+ * to subvolume trees.
+ */
+ if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ return 0;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 32fecc9dc1dd..ece497e26558 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5f9e2dd413af..b6cf39f4e7e4 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -27,15 +27,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int next_key_ret = 0;
u64 last_ret = 0;
- if (root->fs_info->extent_root == root) {
- /*
- * there's recursion here right now in the tree locking,
- * we can't defrag the extent root without deadlock
- */
- goto out;
- }
-
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
goto out;
path = btrfs_alloc_path();
@@ -52,7 +44,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
@@ -133,10 +124,9 @@ out:
ret = 0;
}
done:
- if (ret != -EAGAIN) {
+ if (ret != -EAGAIN)
memset(&root->defrag_progress, 0,
sizeof(root->defrag_progress));
- root->defrag_trans_start = trans->transid;
- }
+
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7dd7552f53a4..813986e38258 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,7 +17,12 @@
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
-#include "inode-map.h"
+#include "block-group.h"
+#include "space-info.h"
+#include "zoned.h"
+#include "inode-item.h"
+
+#define MAX_CONFLICT_INODES 10
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -28,8 +33,6 @@
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
- LOG_OTHER_INODE,
- LOG_OTHER_INODE_ALL,
};
/*
@@ -92,10 +95,8 @@ enum {
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -105,6 +106,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
@@ -139,16 +141,45 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ const bool zoned = btrfs_is_zoned(fs_info);
int ret = 0;
+ bool created = false;
+
+ /*
+ * First check if the log root tree was already created. If not, create
+ * it before locking the root's log_mutex, just to keep lockdep happy.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
+ mutex_lock(&tree_root->log_mutex);
+ if (!fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ if (!ret) {
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ created = true;
+ }
+ }
+ mutex_unlock(&tree_root->log_mutex);
+ if (ret)
+ return ret;
+ }
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
@@ -156,24 +187,28 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
- mutex_lock(&fs_info->tree_log_mutex);
- if (!fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, fs_info);
- mutex_unlock(&fs_info->tree_log_mutex);
- if (ret)
+ /*
+ * This means fs_info->log_root_tree was already created
+ * for some other FS trees. Do the full commit not to mix
+ * nodes from multiple log transactions to do sequential
+ * writing.
+ */
+ if (zoned && !created) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
+ }
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
- atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -191,11 +226,22 @@ out:
*/
static int join_running_log_trans(struct btrfs_root *root)
{
+ const bool zoned = btrfs_is_zoned(root->fs_info);
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
ret = 0;
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex);
@@ -209,9 +255,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -226,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
-static int btrfs_write_tree_block(struct extent_buffer *buf)
-{
- return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
- buf->start + buf->len - 1);
-}
-
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
filemap_fdatawait_range(buf->pages[0]->mapping,
@@ -250,16 +288,6 @@ struct walk_control {
*/
int free;
- /* should we write out the extent buffer? This is used
- * while flushing the log tree to disk during a sync
- */
- int write;
-
- /* should we wait for the extent buffer io to finish? Also used
- * while flushing the log tree to disk for a sync
- */
- int wait;
-
/* pin only walk, we record which extents on disk belong to the
* log trees
*/
@@ -305,45 +333,29 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
- if (wc->pin)
- ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
+ if (wc->pin) {
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
+ if (ret)
+ return ret;
- if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
- if (wc->pin && btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, 0) &&
+ btrfs_header_level(eb) == 0)
ret = btrfs_exclude_logged_extents(eb);
- if (wc->write)
- btrfs_write_tree_block(eb);
- if (wc->wait)
- btrfs_wait_tree_block_writeback(eb);
}
return ret;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
@@ -357,18 +369,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
overwrite_root = 1;
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+ * should be at (the first item with a key greater than 'key')
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
if (ret == 0) {
char *src_copy;
char *dst_copy;
- u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+ u32 dst_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (dst_size != item_size)
goto insert;
@@ -462,7 +486,7 @@ insert:
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
- found_size = btrfs_item_size_nr(path->nodes[0],
+ found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
btrfs_truncate_item(path, item_size, 1);
@@ -503,13 +527,8 @@ insert:
*/
if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
- ino_size != 0) {
- struct btrfs_map_token token;
-
- btrfs_init_map_token(&token, dst_eb);
- btrfs_set_token_inode_size(dst_eb, dst_item,
- ino_size, &token);
- }
+ ino_size != 0)
+ btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
@@ -547,19 +566,45 @@ no_copy:
}
/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
static noinline struct inode *read_one_inode(struct btrfs_root *root,
u64 objectid)
{
- struct btrfs_key key;
struct inode *inode;
- key.objectid = objectid;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root);
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -583,6 +628,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
@@ -660,7 +706,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+ drop_args.start = start;
+ drop_args.end = extent_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
if (ret)
goto out;
@@ -715,13 +764,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
@@ -750,7 +801,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0);
+ &ordered_sums, 0, false);
if (ret)
goto out;
/*
@@ -804,17 +855,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
+ struct btrfs_root *csum_root;
+
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
+ csum_root = btrfs_csum_root(fs_info,
+ sums->bytenr);
if (!ret)
- ret = btrfs_del_csums(trans,
- fs_info->csum_root,
+ ret = btrfs_del_csums(trans, csum_root,
sums->bytenr,
sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
- fs_info->csum_root, sums);
+ csum_root,
+ sums);
list_del(&sums->list);
kfree(sums);
}
@@ -830,15 +885,39 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
- inode_add_bytes(inode, nbytes);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
+ extent_end - start);
+ if (ret)
+ goto out;
+
update_inode:
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
- if (inode)
- iput(inode);
+ iput(inode);
return ret;
}
+static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const char *name,
+ int name_len)
+{
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ if (ret)
+ return ret;
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exists anymore.
+ */
+ return btrfs_run_delayed_items(trans);
+}
+
/*
* when cleaning up conflicts between the directory names in the
* subvolume, directory names in the log and directory names in the
@@ -848,11 +927,11 @@ out:
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_root *root = dir->root;
struct inode *inode;
char *name;
int name_len;
@@ -881,12 +960,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
name_len);
- if (ret)
- goto out;
- else
- ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -894,9 +969,11 @@ out:
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -905,29 +982,34 @@ static noinline int inode_in_dir(struct btrfs_root *root,
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -981,8 +1063,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen,
- int *search_done)
+ u64 ref_index, char *name, int namelen)
{
int ret;
char *victim_name;
@@ -1016,7 +1097,7 @@ again:
* otherwise they must be unlinked as a conflict
*/
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
while (ptr < ptr_end) {
victim_ref = (struct btrfs_inode_ref *)ptr;
victim_name_len = btrfs_inode_ref_name_len(leaf,
@@ -1039,27 +1120,17 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
+ ret = unlink_inode_for_log_replay(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- return ret;
- *search_done = 1;
goto again;
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
-
- /*
- * NOTE: we have searched root tree and checked the
- * corresponding ref, it does not need to check again.
- */
- *search_done = 1;
}
btrfs_release_path(path);
@@ -1067,7 +1138,9 @@ again:
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -1075,7 +1148,7 @@ again:
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
@@ -1101,6 +1174,7 @@ again:
parent_objectid, victim_name,
victim_name_len);
if (ret < 0) {
+ kfree(victim_name);
return ret;
} else if (!ret) {
ret = -ENOENT;
@@ -1110,35 +1184,32 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
victim_name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(
- trans);
}
iput(victim_parent);
kfree(victim_name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
kfree(victim_name);
next:
cur_offset += victim_name_len + sizeof(*extref);
}
- *search_done = 1;
}
btrfs_release_path(path);
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1147,8 +1218,10 @@ next:
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
@@ -1233,7 +1306,7 @@ again:
eb = path->nodes[0];
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
+ ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
while (ref_ptr < ref_end) {
char *name = NULL;
int namelen;
@@ -1268,7 +1341,7 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
@@ -1290,106 +1363,6 @@ again:
return ret;
}
-static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
- const u8 ref_type, const char *name,
- const int namelen)
-{
- struct btrfs_key key;
- struct btrfs_path *path;
- const u64 parent_id = btrfs_ino(BTRFS_I(dir));
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = ref_type;
- if (key.type == BTRFS_INODE_REF_KEY)
- key.offset = parent_id;
- else
- key.offset = btrfs_extref_hash(parent_id, name, namelen);
-
- ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id, name, namelen);
- else
- ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen);
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct inode *dir, struct inode *inode, const char *name,
- int namelen, u64 ref_index)
-{
- struct btrfs_dir_item *dir_item;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct inode *other_inode = NULL;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_item = btrfs_lookup_dir_item(NULL, root, path,
- btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
- if (!dir_item) {
- btrfs_release_path(path);
- goto add_link;
- } else if (IS_ERR(dir_item)) {
- ret = PTR_ERR(dir_item);
- goto out;
- }
-
- /*
- * Our inode's dentry collides with the dentry of another inode which is
- * in the log but not yet processed since it has a higher inode number.
- * So delete that other dentry.
- */
- btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
- btrfs_release_path(path);
- other_inode = read_one_inode(root, key.objectid);
- if (!other_inode) {
- ret = -ENOENT;
- goto out;
- }
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
- if (ret)
- goto out;
- /*
- * If we dropped the link count to 0, bump it so that later the iput()
- * on the inode will not free it. We will fixup the link count later.
- */
- if (other_inode->i_nlink == 0)
- inc_nlink(other_inode);
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
-add_link:
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, namelen, 0, ref_index);
-out:
- iput(other_inode);
- btrfs_free_path(path);
-
- return ret;
-}
-
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1410,7 +1383,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
char *name = NULL;
int namelen;
int ret;
- int search_done = 0;
int log_ref_ver = 0;
u64 parent_objectid;
u64 inode_objectid;
@@ -1418,7 +1390,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
int ref_struct_size;
ref_ptr = btrfs_item_ptr_offset(eb, slot);
- ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+ ref_end = ref_ptr + btrfs_item_size(eb, slot);
if (key->type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *r;
@@ -1472,10 +1444,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1483,56 +1457,27 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
-
- if (!search_done) {
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir),
- BTRFS_I(inode),
- inode_objectid,
- parent_objectid,
- ref_index, name, namelen,
- &search_done);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out;
- }
- }
-
- /*
- * If a reference item already exists for this inode
- * with the same parent and name, but different index,
- * drop it and the corresponding directory index entries
- * from the parent before adding the new reference item
- * and dir index entries, otherwise we would fail with
- * -EEXIST returned from btrfs_add_link() below.
- */
- ret = btrfs_inode_ref_exists(inode, dir, key->type,
- name, namelen);
- if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
- BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen);
- /*
- * If we dropped the link count to 0, bump it so
- * that later the iput() on the inode will not
- * free it. We will fixup the link count later.
- */
- if (!ret && inode->i_nlink == 0)
- inc_nlink(inode);
- }
- if (ret < 0)
+ ret = __add_inode_ref(trans, root, path, log,
+ BTRFS_I(dir), BTRFS_I(inode),
+ inode_objectid, parent_objectid,
+ ref_index, name, namelen);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out;
+ }
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
- ref_index);
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
if (ret)
goto out;
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1566,18 +1511,6 @@ out:
return ret;
}
-static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 ino)
-{
- int ret;
-
- ret = btrfs_insert_orphan_item(trans, root, ino);
- if (ret == -EEXIST)
- ret = 0;
-
- return ret;
-}
-
static int count_inode_extrefs(struct btrfs_root *root,
struct btrfs_inode *inode, struct btrfs_path *path)
{
@@ -1599,7 +1532,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
break;
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
cur_offset = 0;
@@ -1653,7 +1586,7 @@ process_slot:
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+ ptr_end = ptr + btrfs_item_size(path->nodes[0],
path->slots[0]);
while (ptr < ptr_end) {
struct btrfs_inode_ref *ref;
@@ -1718,7 +1651,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1729,7 +1664,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- ret = insert_orphan_item(trans, root, ino);
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
}
out:
@@ -1754,6 +1691,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1766,17 +1704,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1785,8 +1725,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -1822,11 +1760,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
else
inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
- } else {
- BUG(); /* Logic Error */
}
iput(inode);
@@ -1868,6 +1804,34 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
return ret;
}
+static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_path *path,
+ struct btrfs_dir_item *dst_di,
+ const struct btrfs_key *log_key,
+ u8 log_type,
+ bool exists)
+{
+ struct btrfs_key found_key;
+
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+ /* The existing dentry points to the same inode, don't delete it. */
+ if (found_key.objectid == log_key->objectid &&
+ found_key.type == log_key->type &&
+ found_key.offset == log_key->offset &&
+ btrfs_dir_type(path->nodes[0], dst_di) == log_type)
+ return 1;
+
+ /*
+ * Don't drop the conflicting directory entry if the inode for the new
+ * entry doesn't exist.
+ */
+ if (!exists)
+ return 0;
+
+ return drop_one_dir_item(trans, path, dir, dst_di);
+}
+
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
@@ -1893,14 +1857,17 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
{
char *name;
int name_len;
- struct btrfs_dir_item *dst_di;
- struct btrfs_key found_key;
+ struct btrfs_dir_item *dir_dst_di;
+ struct btrfs_dir_item *index_dst_di;
+ bool dir_dst_matches = false;
+ bool index_dst_matches = false;
struct btrfs_key log_key;
+ struct btrfs_key search_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
- bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ bool exists;
+ int ret;
+ bool update_size = true;
bool name_added = false;
dir = read_one_inode(root, key->objectid);
@@ -1919,79 +1886,60 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
- if (key->type == BTRFS_DIR_ITEM_KEY) {
- dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
- name, name_len, 1);
- } else if (key->type == BTRFS_DIR_INDEX_KEY) {
- dst_di = btrfs_lookup_dir_index_item(trans, root, path,
- key->objectid,
- key->offset, name,
- name_len, 1);
- } else {
- /* Corruption */
- ret = -EINVAL;
+ dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+ name, name_len, 1);
+ if (IS_ERR(dir_dst_di)) {
+ ret = PTR_ERR(dir_dst_di);
goto out;
- }
- if (IS_ERR_OR_NULL(dst_di)) {
- /* we need a sequence number to insert, so we only
- * do inserts for the BTRFS_DIR_INDEX_KEY types
- */
- if (key->type != BTRFS_DIR_INDEX_KEY)
+ } else if (dir_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ dir_dst_di, &log_key, log_type,
+ exists);
+ if (ret < 0)
goto out;
- goto insert;
+ dir_dst_matches = (ret == 1);
}
- btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
- /* the existing item matches the logged item */
- if (found_key.objectid == log_key.objectid &&
- found_key.type == log_key.type &&
- found_key.offset == log_key.offset &&
- btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
- update_size = false;
+ btrfs_release_path(path);
+
+ index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+ key->objectid, key->offset,
+ name, name_len, 1);
+ if (IS_ERR(index_dst_di)) {
+ ret = PTR_ERR(index_dst_di);
goto out;
+ } else if (index_dst_di) {
+ ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
+ index_dst_di, &log_key,
+ log_type, exists);
+ if (ret < 0)
+ goto out;
+ index_dst_matches = (ret == 1);
}
- /*
- * don't drop the conflicting directory entry if the inode
- * for the new entry doesn't exist
- */
- if (!exists)
- goto out;
+ btrfs_release_path(path);
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
- if (ret)
+ if (dir_dst_matches && index_dst_matches) {
+ ret = 0;
+ update_size = false;
goto out;
-
- if (key->type == BTRFS_DIR_INDEX_KEY)
- goto insert;
-out:
- btrfs_release_path(path);
- if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
- ret = btrfs_update_inode(trans, root, dir);
}
- kfree(name);
- iput(dir);
- if (!ret && name_added)
- ret = 1;
- return ret;
-insert:
/*
* Check if the inode reference exists in the log for the given name,
* inode and parent inode
*/
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_REF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -2001,10 +1949,10 @@ insert:
goto out;
}
- found_key.objectid = log_key.objectid;
- found_key.type = BTRFS_INODE_EXTREF_KEY;
- found_key.offset = key->objectid;
- ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+ search_key.objectid = log_key.objectid;
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
name_len);
if (ret < 0) {
goto out;
@@ -2023,87 +1971,76 @@ insert:
name_added = true;
update_size = false;
ret = 0;
- goto out;
+
+out:
+ if (!ret && update_size) {
+ btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ }
+ kfree(name);
+ iput(dir);
+ if (!ret && name_added)
+ ret = 1;
+ return ret;
}
-/*
- * find all the names in a directory item and reconcile them into
- * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
- * one name in a directory item, but the same code gets used for
- * both directory index types
- */
+/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- int ret = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ int ret;
struct btrfs_dir_item *di;
- int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
- struct btrfs_path *fixup_path = NULL;
-
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret < 0)
- break;
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
- /*
- * If this entry refers to a non-directory (directories can not
- * have a link count > 1) and it was added in the transaction
- * that was not committed, make sure we fixup the link count of
- * the inode it the entry points to. Otherwise something like
- * the following would result in a directory pointing to an
- * inode with a wrong link that does not account for this dir
- * entry:
- *
- * mkdir testdir
- * touch testdir/foo
- * touch testdir/bar
- * sync
- *
- * ln testdir/bar testdir/bar_link
- * ln testdir/foo testdir/foo_link
- * xfs_io -c "fsync" testdir/bar
- *
- * <power failure>
- *
- * mount fs, log replay happens
- *
- * File foo would remain with a link count of 1 when it has two
- * entries pointing to it in the directory testdir. This would
- * make it impossible to ever delete the parent directory has
- * it would result in stale dentries that can never be deleted.
- */
- if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
- struct btrfs_key di_key;
+ /* We only log dir index keys, which only contain a single dir item. */
+ ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
- if (!fixup_path) {
- fixup_path = btrfs_alloc_path();
- if (!fixup_path) {
- ret = -ENOMEM;
- break;
- }
- }
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ ret = replay_one_name(trans, root, path, eb, di, key);
+ if (ret < 0)
+ return ret;
- btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- ret = link_to_fixup_dir(trans, root, fixup_path,
- di_key.objectid);
- if (ret)
- break;
- }
- ret = 0;
+ /*
+ * If this entry refers to a non-directory (directories can not have a
+ * link count > 1) and it was added in the transaction that was not
+ * committed, make sure we fixup the link count of the inode the entry
+ * points to. Otherwise something like the following would result in a
+ * directory pointing to an inode with a wrong link that does not account
+ * for this dir entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two entries
+ * pointing to it in the directory testdir. This would make it impossible
+ * to ever delete the parent directory has it would result in stale
+ * dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_path *fixup_path;
+ struct btrfs_key di_key;
+
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path)
+ return -ENOMEM;
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
+ btrfs_free_path(fixup_path);
}
- btrfs_free_path(fixup_path);
+
return ret;
}
@@ -2120,7 +2057,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
- u64 dirid, int key_type,
+ u64 dirid,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
@@ -2133,7 +2070,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
return 1;
key.objectid = dirid;
- key.type = key_type;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2147,7 +2084,7 @@ static noinline int find_dir_range(struct btrfs_root *root,
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto next;
}
@@ -2174,7 +2111,7 @@ next:
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.type != key_type || key.objectid != dirid) {
+ if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
ret = 1;
goto out;
}
@@ -2195,105 +2132,85 @@ out:
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
- u32 item_size;
struct btrfs_dir_item *di;
- struct btrfs_dir_item *log_di;
int name_len;
- unsigned long ptr;
- unsigned long ptr_end;
char *name;
- struct inode *inode;
+ struct inode *inode = NULL;
struct btrfs_key location;
-again:
+ /*
+ * Currently we only log dir index keys. Even if we replay a log created
+ * by an older kernel that logged both dir index and dir item keys, all
+ * we need to do is process the dir index keys, we (and our caller) can
+ * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
+ */
+ ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
+
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
- ptr = btrfs_item_ptr_offset(eb, slot);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- di = (struct btrfs_dir_item *)ptr;
- name_len = btrfs_dir_name_len(eb, di);
- name = kmalloc(name_len, GFP_NOFS);
- if (!name) {
- ret = -ENOMEM;
- goto out;
- }
- read_extent_buffer(eb, name, (unsigned long)(di + 1),
- name_len);
- log_di = NULL;
- if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
- log_di = btrfs_lookup_dir_item(trans, log, log_path,
- dir_key->objectid,
- name, name_len, 0);
- } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
- log_di = btrfs_lookup_dir_index_item(trans, log,
- log_path,
- dir_key->objectid,
- dir_key->offset,
- name, name_len, 0);
- }
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
- btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(path);
- btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- kfree(name);
- return -EIO;
- }
+ di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ name_len = btrfs_dir_name_len(eb, di);
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ret = link_to_fixup_dir(trans, root,
- path, location.objectid);
- if (ret) {
- kfree(name);
- iput(inode);
- goto out;
- }
+ read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
- inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
- BTRFS_I(inode), name, name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(trans);
- kfree(name);
- iput(inode);
- if (ret)
- goto out;
+ if (log) {
+ struct btrfs_dir_item *log_di;
- /* there might still be more names under this key
- * check and repeat if required
- */
- ret = btrfs_search_slot(NULL, root, dir_key, path,
- 0, 0);
- if (ret == 0)
- goto again;
+ log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
+ dir_key->objectid,
+ dir_key->offset,
+ name, name_len, 0);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ } else if (log_di) {
+ /* The dentry exists in the log, we have nothing to do. */
ret = 0;
goto out;
- } else if (IS_ERR(log_di)) {
- kfree(name);
- return PTR_ERR(log_di);
}
- btrfs_release_path(log_path);
- kfree(name);
+ }
- ptr = (unsigned long)(di + 1);
- ptr += name_len;
+ btrfs_dir_item_key_to_cpu(eb, di, &location);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
+ inode = read_one_inode(root, location.objectid);
+ if (!inode) {
+ ret = -EIO;
+ goto out;
}
- ret = 0;
+
+ ret = link_to_fixup_dir(trans, root, path, location.objectid);
+ if (ret)
+ goto out;
+
+ inc_nlink(inode);
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, name_len);
+ /*
+ * Unlike dir item keys, dir index keys can only have one name (entry) in
+ * them, as there are no key collisions since each key has a unique offset
+ * (an index number), so we're done.
+ */
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
+ kfree(name);
+ iput(inode);
return ret;
}
@@ -2336,7 +2253,7 @@ process_leaf:
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
- total_size = btrfs_item_size_nr(path->nodes[0], i);
+ total_size = btrfs_item_size(path->nodes[0], i);
cur = 0;
while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
@@ -2413,7 +2330,6 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
{
u64 range_start;
u64 range_end;
- int key_type = BTRFS_DIR_LOG_ITEM_KEY;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
@@ -2421,7 +2337,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct inode *dir;
dir_key.objectid = dirid;
- dir_key.type = BTRFS_DIR_ITEM_KEY;
+ dir_key.type = BTRFS_DIR_INDEX_KEY;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
@@ -2435,16 +2351,18 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
btrfs_free_path(log_path);
return 0;
}
-again:
+
range_start = 0;
range_end = 0;
while (1) {
if (del_all)
range_end = (u64)-1;
else {
- ret = find_dir_range(log, path, dirid, key_type,
+ ret = find_dir_range(log, path, dirid,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -2467,13 +2385,15 @@ again:
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != dirid ||
- found_key.type != dir_key.type)
- goto next_type;
+ found_key.type != dir_key.type) {
+ ret = 0;
+ goto out;
+ }
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
+ ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
@@ -2487,15 +2407,7 @@ again:
break;
range_start = range_end + 1;
}
-
-next_type:
ret = 0;
- if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
- key_type = BTRFS_DIR_LOG_INDEX_KEY;
- dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(path);
- goto again;
- }
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
@@ -2524,7 +2436,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2588,6 +2500,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct inode *inode;
u64 from;
@@ -2598,12 +2511,18 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
}
from = ALIGN(i_size_read(inode),
root->fs_info->sectorsize);
- ret = btrfs_drop_extents(wc->trans, root, inode,
- from, (u64)-1, 1);
+ drop_args.start = from;
+ drop_args.end = (u64)-1;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(wc->trans, root,
+ BTRFS_I(inode),
+ &drop_args);
if (!ret) {
+ inode_sub_bytes(inode,
+ drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
- root, inode);
+ root, BTRFS_I(inode));
}
iput(inode);
if (ret)
@@ -2648,29 +2567,51 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
eb, i, &key);
if (ret)
break;
- } else if (key.type == BTRFS_DIR_ITEM_KEY) {
- ret = replay_one_dir_item(wc->trans, root, path,
- eb, i, &key);
- if (ret)
- break;
}
+ /*
+ * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
+ * BTRFS_DIR_INDEX_KEY items which we use to derive the
+ * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
+ * older kernel with such keys, ignore them.
+ */
}
btrfs_free_path(path);
return ret;
}
+/*
+ * Correctly adjust the reserved bytes occupied by a log tree extent buffer
+ */
+static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ if (!cache) {
+ btrfs_err(fs_info, "unable to find block group for %llu", start);
+ return;
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->reserved -= fs_info->nodesize;
+ cache->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+}
+
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 root_owner;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
- struct extent_buffer *parent;
u32 blocksize;
int ret = 0;
@@ -2690,10 +2631,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
-
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ btrfs_header_owner(cur),
+ *level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -2707,7 +2647,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen,
+ ret = btrfs_read_extent_buffer(next, ptr_gen,
*level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
@@ -2716,28 +2656,27 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ bytenr, blocksize);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+ btrfs_redirty_list_add(
+ trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
- }
-
- WARN_ON(root_owner !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_pin_reserved_extent(fs_info,
- bytenr, blocksize);
- if (ret) {
- free_extent_buffer(next);
- return ret;
+ unaccount_log_buffer(fs_info, bytenr);
}
}
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2762,7 +2701,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 root_owner;
int i;
int slot;
int ret;
@@ -2775,13 +2713,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- struct extent_buffer *parent;
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
@@ -2795,21 +2726,23 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ if (ret)
+ return ret;
+ btrfs_redirty_list_add(trans->transaction,
+ next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
- }
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_pin_reserved_extent(fs_info,
- path->nodes[*level]->start,
- path->nodes[*level]->len);
- if (ret)
- return ret;
+ unaccount_log_buffer(fs_info,
+ path->nodes[*level]->start);
+ }
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -2876,19 +2809,19 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ next->start, next->len);
+ if (ret)
+ goto out;
+ btrfs_redirty_list_add(trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
+ unaccount_log_buffer(fs_info, next->start);
}
-
- ret = btrfs_pin_reserved_extent(fs_info, next->start,
- next->len);
- if (ret)
- goto out;
}
}
@@ -2964,9 +2897,6 @@ static void wait_for_writer(struct btrfs_root *root)
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- if (!ctx)
- return;
-
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
@@ -2986,8 +2916,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
list_del_init(&ctx->list);
ctx->log_ret = error;
}
-
- INIT_LIST_HEAD(&root->log_ctxs[index]);
}
/*
@@ -3016,6 +2944,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
+ u64 log_root_start;
+ u64 log_root_level;
mutex_lock(&root->log_mutex);
log_transid = ctx->log_transid;
@@ -3053,7 +2983,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -3068,6 +2998,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
+ /*
+ * -EAGAIN happens when someone, e.g., a concurrent transaction
+ * commit, writes a dirty extent in this tree-log commit. This
+ * concurrent write will create a hole writing out the extents,
+ * and we cannot proceed on a zoned filesystem, requiring
+ * sequential writing. While we can bail out to a full commit
+ * here, but we can continue hoping the concurrent writing fills
+ * the hole.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
+ ret = 0;
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
@@ -3102,32 +3043,33 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
+ goto out;
+ }
+ }
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ }
+
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
- mutex_unlock(&log_root_tree->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
-
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
-
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /* atomic_dec_and_test implies a barrier */
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
- }
-
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
@@ -3142,7 +3084,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -3173,8 +3115,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
@@ -3183,7 +3123,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
@@ -3191,7 +3131,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
- if (ret) {
+ /*
+ * As described above, -EAGAIN indicates a hole in the extents. We
+ * cannot wait for these write outs since the waiting cause a
+ * deadlock. Bail out to the full commit instead.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ } else if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3207,32 +3157,63 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- btrfs_set_super_log_root(fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
-
+ log_root_start = log_root_tree->node->start;
+ log_root_level = btrfs_header_level(log_root_tree->node);
log_root_tree->log_transid++;
mutex_unlock(&log_root_tree->log_mutex);
/*
- * Nobody else is going to jump in and write the ctree
- * super here because the log_commit atomic below is protecting
- * us. We must be called with a transaction handle pinning
- * the running transaction open, so a full commit can't hop
- * in and cause problems either.
+ * Here we are guaranteed that nobody is going to write the superblock
+ * for the current transaction before us and that neither we do write
+ * our superblock before the previous transaction finishes its commit
+ * and writes its superblock, because:
+ *
+ * 1) We are holding a handle on the current transaction, so no body
+ * can commit it until we release the handle;
+ *
+ * 2) Before writing our superblock we acquire the tree_log_mutex, so
+ * if the previous transaction is still committing, and hasn't yet
+ * written its superblock, we wait for it to do it, because a
+ * transaction commit acquires the tree_log_mutex when the commit
+ * begins and releases it only after writing its superblock.
+ */
+ mutex_lock(&fs_info->tree_log_mutex);
+
+ /*
+ * The previous transaction writeout phase could have failed, and thus
+ * marked the fs in an error state. We must not commit here, as we
+ * could have updated our generation in the super_for_commit and
+ * writing the super here would result in transid mismatches. If there
+ * is an error here just bail.
*/
+ if (BTRFS_FS_ERROR(fs_info)) {
+ ret = -EIO;
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto out_wake_log_root;
+ }
+
+ btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
+ mutex_unlock(&fs_info->tree_log_mutex);
if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
- mutex_lock(&root->log_mutex);
- if (root->last_log_commit < log_transid)
- root->last_log_commit = log_transid;
- mutex_unlock(&root->log_mutex);
+ /*
+ * We know there can only be one task here, since we have not yet set
+ * root->log_commit[index1] to 0 and any task attempting to sync the
+ * log must wait for the previous log transaction to commit if it's
+ * still in progress or wait for the current log transaction commit if
+ * someone else already started it. We use <= and not < because the
+ * first log transaction has an ID of 0.
+ */
+ ASSERT(root->last_log_commit <= log_transid);
+ root->last_log_commit = log_transid;
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3273,18 +3254,44 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ if (log->node) {
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ /*
+ * We weren't able to traverse the entire log tree, the
+ * typical scenario is getting an -EIO when reading an
+ * extent buffer of the tree, due to a previous writeback
+ * failure of it.
+ */
+ set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ &log->fs_info->fs_state);
+
+ /*
+ * Some extent buffers of the log tree may still be dirty
+ * and not yet written back to storage, because we may
+ * have updates to a log tree without syncing a log tree,
+ * such as during rename and link operations. So flush
+ * them out and wait for their writeback to complete, so
+ * that we properly cleanup their state and pages.
+ */
+ btrfs_write_marked_extents(log->fs_info,
+ &log->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ btrfs_wait_tree_log_extents(log,
+ EXTENT_DIRTY | EXTENT_NEW);
+
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
}
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- free_extent_buffer(log->node);
- kfree(log);
+ extent_io_tree_release(&log->log_csum_range);
+
+ btrfs_put_root(log);
}
/*
@@ -3296,6 +3303,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
@@ -3306,32 +3314,162 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
if (fs_info->log_root_tree) {
free_log_tree(trans, fs_info->log_root_tree);
fs_info->log_root_tree = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
}
return 0;
}
/*
- * Check if an inode was logged in the current transaction. We can't always rely
- * on an inode's logged_trans value, because it's an in-memory only field and
- * therefore not persisted. This means that its value is lost if the inode gets
- * evicted and loaded again from disk (in which case it has a value of 0, and
- * certainly it is smaller then any possible transaction ID), when that happens
- * the full_sync flag is set in the inode's runtime flags, so on that case we
- * assume eviction happened and ignore the logged_trans value, assuming the
- * worst case, that the inode was logged before in the current transaction.
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
*/
-static bool inode_logged(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+static int inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
if (inode->logged_trans == trans->transid)
- return true;
+ return 1;
+
+ /*
+ * If logged_trans is not 0, then we know the inode logged was not logged
+ * in this transaction, so we can return false right away.
+ */
+ if (inode->logged_trans > 0)
+ return 0;
+
+ /*
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, specially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may have not been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less then the
+ * current transaction to avoid doing the search in future calls.
+ */
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transacion's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ inode->logged_trans = trans->transid;
- if (inode->last_trans == trans->transid &&
- test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
- !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
- return true;
+ /*
+ * If it's a directory, then we must set last_dir_index_offset to the
+ * maximum possible value, so that the next attempt to log the inode does
+ * not skip checking if dir index keys found in modified subvolume tree
+ * leaves have been logged before, otherwise it would result in attempts
+ * to insert duplicate dir index keys in the log tree. This must be done
+ * because last_dir_index_offset is an in-memory only field, not persisted
+ * in the inode item or any other on-disk structure, so its value is lost
+ * once the inode is evicted.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_dir_index_offset = (u64)-1;
+
+ return 1;
+}
+
+/*
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exists
+ * 0 if the entry existed and was successfully deleted
+ */
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const char *name, int name_len,
+ u64 index)
+{
+ struct btrfs_dir_item *di;
+
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
- return false;
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_delete_one_dir_name(trans, log, path, di);
}
/*
@@ -3355,143 +3493,74 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index)
{
- struct btrfs_root *log;
- struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
- int err = 0;
- int bytes_del = 0;
- u64 dir_ino = btrfs_ino(dir);
- if (!inode_logged(trans, dir))
- return 0;
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
mutex_lock(&dir->log_mutex);
- log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_unlock;
}
- di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
- name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
- if (ret) {
- err = ret;
- goto fail;
- }
- }
- btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
- if (ret) {
- err = ret;
- goto fail;
- }
- }
-
- /* update the directory size in the log to reflect the names
- * we have removed
- */
- if (bytes_del) {
- struct btrfs_key key;
-
- key.objectid = dir_ino;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(path);
-
- ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
- if (ret == 0) {
- struct btrfs_inode_item *item;
- u64 i_size;
-
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_inode_item);
- i_size = btrfs_inode_size(path->nodes[0], item);
- if (i_size > bytes_del)
- i_size -= bytes_del;
- else
- i_size = 0;
- btrfs_set_inode_size(path->nodes[0], item, i_size);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- } else
- ret = 0;
- btrfs_release_path(path);
- }
-fail:
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, name_len, index);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
- if (!inode_logged(trans, inode))
- return 0;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
+ return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
/*
@@ -3502,7 +3571,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- int key_type, u64 dirid,
+ u64 dirid,
u64 first_offset, u64 last_offset)
{
int ret;
@@ -3511,49 +3580,236 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
key.objectid = dirid;
key.offset = first_offset;
- if (key_type == BTRFS_DIR_ITEM_KEY)
- key.type = BTRFS_DIR_LOG_ITEM_KEY;
- else
- key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes and that result in pushing off some dir items
+ * from one leaf to another in order to accommodate for the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second items starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src = path->nodes[0];
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
+ last_found = true;
+ break;
+ }
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ ctx->last_dir_item_offset = key.offset;
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * resulted always in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path, int key_type,
+ struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
int err = 0;
int ret;
- int i;
- int nritems;
- u64 first_offset = min_offset;
+ u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = min_offset;
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
@@ -3562,9 +3818,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ if (ret != 0 || min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
min_key.objectid = ino;
- min_key.type = key_type;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
min_key.offset = (u64)-1;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -3572,7 +3829,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -3581,29 +3838,31 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
- if (key_type == tmp.type)
- first_offset = max(min_offset, tmp.offset) + 1;
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
}
goto done;
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (key_type == tmp.type) {
- first_offset = tmp.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret) {
- err = ret;
- goto done;
- }
- }
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
}
btrfs_release_path(path);
@@ -3615,6 +3874,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3624,55 +3884,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
+ if (ret != 0) {
+ if (ret < 0)
err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
@@ -3686,21 +3905,29 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
err = ret;
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret)
- err = ret;
- else
- last_offset = tmp.offset;
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
@@ -3709,18 +3936,91 @@ done:
if (err == 0) {
*last_offset_ret = last_offset;
/*
- * insert the log range keys to indicate where the log
- * is valid
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, last_old_dentry_offset is == to last_offset.
*/
- ret = insert_dir_log_key(trans, log, path, key_type,
- ino, first_offset, last_offset);
- if (ret)
- err = ret;
+ ASSERT(last_old_dentry_offset <= last_offset);
+ if (last_old_dentry_offset < last_offset) {
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
}
return err;
}
/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is (u64)-1, so we don't the value of the last index
+ * key offset. If that's the case, search for it and update the inode. This
+ * is to avoid lookups in the log tree every time we try to insert a dir index
+ * key from a leaf changed in the current transaction, and to allow us to always
+ * do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != (u64)-1)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
+/*
* logging directories is very similar to logging inodes, We find all the items
* from the current transaction and write them to the log.
*
@@ -3733,7 +4033,7 @@ done:
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
@@ -3741,13 +4041,17 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 min_key;
u64 max_key;
int ret;
- int key_type = BTRFS_DIR_ITEM_KEY;
-again:
- min_key = 0;
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
+ min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path,
ctx, min_key, &max_key);
if (ret)
return ret;
@@ -3756,10 +4060,8 @@ again:
min_key = max_key + 1;
}
- if (key_type == BTRFS_DIR_ITEM_KEY) {
- key_type = BTRFS_DIR_INDEX_KEY;
- goto again;
- }
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
+
return 0;
}
@@ -3769,17 +4071,18 @@ again:
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -3796,13 +4099,12 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
found_key.type = 0;
- ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
- &start_slot);
+ ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
if (ret < 0)
break;
@@ -3822,6 +4124,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ struct btrfs_truncate_control control = {
+ .new_size = new_size,
+ .ino = btrfs_ino(inode),
+ .min_type = min_type,
+ .skip_ref_updates = true,
+ };
+
+ return btrfs_truncate_inode_items(trans, log_root, &control);
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
@@ -3829,6 +4146,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
u64 logged_isize)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
@@ -3838,56 +4156,89 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
+ btrfs_set_token_inode_generation(&token, item, 0);
+ btrfs_set_token_inode_size(&token, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(leaf, item,
- BTRFS_I(inode)->generation,
- &token);
- btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
- }
-
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->atime,
- inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->atime,
- inode->i_atime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->mtime,
- inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- inode->i_mtime.tv_nsec, &token);
-
- btrfs_set_token_timespec_sec(leaf, &item->ctime,
- inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- inode->i_ctime.tv_nsec, &token);
-
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- &token);
-
- btrfs_set_token_inode_sequence(leaf, item,
- inode_peek_iversion(inode), &token);
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ btrfs_set_token_inode_generation(&token, item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_token_inode_size(&token, item, inode->i_size);
+ }
+
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+
+ btrfs_set_token_timespec_sec(&token, &item->atime,
+ inode->i_atime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
+ inode->i_atime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
+ inode->i_mtime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ inode->i_mtime.tv_nsec);
+
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
+ inode->i_ctime.tv_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ inode->i_ctime.tv_nsec);
+
+ /*
+ * We do not need to set the nbytes field, in fact during a fast fsync
+ * its value may not even be correct, since a fast fsync does not wait
+ * for ordered extent completion, which is where we update nbytes, it
+ * only waits for writeback to complete. During log replay as we find
+ * file extent items and replay them, we adjust the nbytes field of the
+ * inode item in subvolume tree as needed (see overwrite_item()).
+ */
+
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
+ btrfs_set_token_inode_block_group(&token, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct btrfs_path *path,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
int ret;
- ret = btrfs_insert_empty_item(trans, log, path,
- &inode->location, sizeof(*inode_item));
- if (ret && ret != -EEXIST)
+ /*
+ * If we are doing a fast fsync and the inode was logged before in the
+ * current transaction, then we know the inode was previously logged and
+ * it exists in the log tree. For performance reasons, in this case use
+ * btrfs_search_slot() directly with ins_len set to 0 so that we never
+ * attempt a write lock on the leaf's parent, which adds unnecessary lock
+ * contention in case there are concurrent fsyncs for other inodes of the
+ * same subvolume. Using btrfs_insert_empty_item() when the inode item
+ * already exists can also result in unnecessarily splitting a leaf.
+ */
+ if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+ ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ASSERT(ret <= 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ } else {
+ /*
+ * This means it is the first fsync in the current transaction,
+ * so the inode item is not in the log and we need to insert it.
+ * We can never get -EEXIST because we are only called for a fast
+ * fsync and in case an inode eviction happens after the inode was
+ * logged before in the current transaction, when we load again
+ * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
+ * flags and set ->logged_trans to 0.
+ */
+ ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ sizeof(*inode_item));
+ ASSERT(ret != -EEXIST);
+ }
+ if (ret)
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -3898,12 +4249,33 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
}
static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
+ const u64 lock_end = sums->bytenr + sums->len - 1;
+ struct extent_state *cached_state = NULL;
int ret;
/*
+ * If this inode was not used for reflink operations in the current
+ * transaction with new extents, then do the fast path, no need to
+ * worry about logging checksum items with overlapping ranges.
+ */
+ if (inode->last_reflink_trans < trans->transid)
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+
+ /*
+ * Serialize logging for checksums. This is to avoid racing with the
+ * same checksum being logged by another task that is logging another
+ * file which happens to refer to the same extent as well. Such races
+ * can leave checksum items in the log with overlapping ranges.
+ */
+ ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
+ if (ret)
+ return ret;
+ /*
* Due to extent cloning, we might have logged a csum item that covers a
* subrange of a cloned extent, and later we can end up logging a csum
* item for a larger subrange of the same extent or the entire range.
@@ -3913,10 +4285,13 @@ static int log_csums(struct btrfs_trans_handle *trans,
* trim and adjust) any existing csum items in the log for this range.
*/
ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
- if (ret)
- return ret;
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
+
+ unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
- return btrfs_csum_file_blocks(trans, log_root, sums);
+ return ret;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -3926,22 +4301,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- unsigned long src_offset;
- unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- int ret;
+ int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
+ struct btrfs_item_batch batch;
char *ins_data;
int i;
- struct list_head ordered_sums;
- int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
-
- INIT_LIST_HEAD(&ordered_sums);
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -3950,27 +4321,155 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = 0;
+ dst_index = 0;
for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
- btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
- }
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
- if (ret) {
- kfree(ins_data);
- return ret;
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
+
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+ if (skip_csum)
+ goto add_to_batch;
+
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
+
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
+ } else {
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
+ }
+
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0, false);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
}
- for (i = 0; i < nr; i++, dst_path->slots[0]++) {
- dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
- dst_path->slots[0]);
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ goto out;
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst_index = 0;
+ for (i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
+ */
+ if (dst_index >= batch.nr)
+ break;
+
+ btrfs_item_key_to_cpu(src, &key, src_slot);
- src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
- if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(dst_path->nodes[0],
- dst_path->slots[0],
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
+
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
&inode->vfs_inode,
@@ -3978,80 +4477,24 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ src_offset, ins_sizes[dst_index]);
}
- /* take a reference on file data extents so that truncates
- * or deletes of this inode don't have to relog the inode
- * again
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
- !skip_csum) {
- int found_type;
- extent = btrfs_item_ptr(src, start_slot + i,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(src, extent) < trans->transid)
- continue;
-
- found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- u64 ds, dl, cs, cl;
- ds = btrfs_file_extent_disk_bytenr(src,
- extent);
- /* ds == 0 is a hole */
- if (ds == 0)
- continue;
-
- dl = btrfs_file_extent_disk_num_bytes(src,
- extent);
- cs = btrfs_file_extent_offset(src, extent);
- cl = btrfs_file_extent_num_bytes(src,
- extent);
- if (btrfs_file_extent_compression(src,
- extent)) {
- cs = 0;
- cl = dl;
- }
-
- ret = btrfs_lookup_csums_range(
- fs_info->csum_root,
- ds + cs, ds + cs + cl - 1,
- &ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
- }
- }
+ dst_index++;
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
+out:
kfree(ins_data);
- /*
- * we have to do this after the loop above to avoid changing the
- * log tree while trying to change the log tree.
- */
- ret = 0;
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = log_csums(trans, log, sums);
- list_del(&sums->list);
- kfree(sums);
- }
-
return ret;
}
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
- struct extent_map *em1, *em2;
+ const struct extent_map *em1, *em2;
em1 = list_entry(a, struct extent_map, list);
em2 = list_entry(b, struct extent_map, list);
@@ -4066,10 +4509,15 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *csum_root;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4078,20 +4526,79 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
- ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
+ csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
+ ret = btrfs_lookup_csums_range(csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0);
+ csum_len - 1, &ordered_sums, 0, false);
if (ret)
return ret;
@@ -4100,7 +4607,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = log_csums(trans, log_root, sums);
+ ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4109,83 +4616,79 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_root *log = root->log_root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_drop_extents_args drop_args = { 0 };
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
- struct btrfs_map_token token;
struct btrfs_key key;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
- if (ret)
- return ret;
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
+ extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
- ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
- if (!extent_inserted) {
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (ctx->logged_before) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
+
+ if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = em->start;
ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
+ sizeof(fi));
if (ret)
return ret;
}
leaf = path->nodes[0];
- btrfs_init_map_token(&token, leaf);
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
- &token);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_PREALLOC,
- &token);
- else
- btrfs_set_token_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG,
- &token);
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start,
- &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
- em->block_start -
- extent_offset, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
- &token);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
- &token);
- }
-
- btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
- &token);
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -4195,7 +4698,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4211,6 +4714,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
bool dropped_extents = false;
+ u64 truncate_offset = i_size;
+ struct extent_buffer *leaf;
+ int slot;
int ins_nr = 0;
int start_slot;
int ret;
@@ -4225,9 +4731,43 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
+ /*
+ * We must check if there is a prealloc extent that starts before the
+ * i_size and crosses the i_size boundary. This is to ensure later we
+ * truncate down to the end of that extent and not to the i_size, as
+ * otherwise we end up losing part of the prealloc extent after a log
+ * replay and with an implicit hole if there is another prealloc extent
+ * that starts at an offset beyond i_size.
+ */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ struct btrfs_file_extent_item *ei;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 extent_end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, ei);
+
+ if (extent_end > i_size)
+ truncate_offset = extent_end;
+ }
+ } else {
+ ret = 0;
+ }
+
while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ leaf = path->nodes[0];
+ slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
@@ -4261,13 +4801,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- &inode->vfs_inode,
- i_size,
- BTRFS_EXTENT_DATA_KEY);
- } while (ret == -EAGAIN);
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
@@ -4284,12 +4820,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
}
}
- if (ins_nr > 0) {
+ if (ins_nr > 0)
ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
- if (ret > 0)
- ret = 0;
- }
out:
btrfs_release_path(path);
btrfs_free_path(dst_path);
@@ -4297,43 +4830,23 @@ out:
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 test_gen;
int ret = 0;
int num = 0;
INIT_LIST_HEAD(&extents);
write_lock(&tree->lock);
- test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4347,7 +4860,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
goto process;
}
- if (em->generation <= test_gen)
+ if (em->generation < trans->transid)
continue;
/* We log prealloc extents beyond eof later. */
@@ -4381,7 +4894,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -4389,11 +4902,34 @@ process:
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4446,16 +4982,20 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
int ins_nr = 0;
int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
key.objectid = ino;
key.type = BTRFS_XATTR_ITEM_KEY;
@@ -4494,6 +5034,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
start_slot = slot;
ins_nr++;
path->slots[0]++;
+ found_xattrs = true;
cond_resched();
}
if (ins_nr > 0) {
@@ -4503,6 +5044,9 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
return ret;
}
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
return 0;
}
@@ -4516,10 +5060,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
@@ -4539,9 +5083,7 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
return ret;
while (true) {
- struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf = path->nodes[0];
- u64 len;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
@@ -4568,10 +5110,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
* leafs from the log root.
*/
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0,
- 0, hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
if (ret < 0)
return ret;
@@ -4590,18 +5131,7 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
}
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- prev_extent_end = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(leaf, extent);
- prev_extent_end = key.offset + len;
- }
-
+ prev_extent_end = btrfs_file_extent_end(path);
path->slots[0]++;
cond_resched();
}
@@ -4611,10 +5141,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0, 0,
- hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
if (ret < 0)
return ret;
}
@@ -4674,7 +5202,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
struct btrfs_path *search_path;
char *name = NULL;
u32 name_len = 0;
- u32 item_size = btrfs_item_size_nr(eb, slot);
+ u32 item_size = btrfs_item_size(eb, slot);
u32 cur_offset = 0;
unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
@@ -4757,115 +5285,461 @@ out:
return ret;
}
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the later
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ const struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * do_overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (true) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ bool continue_curr_inode = true;
+ int nritems;
+ int i;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
+ log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ if (continue_curr_inode && min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+
+next:
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
+ return ret;
+}
+
struct btrfs_ino_list {
u64 ino;
u64 parent;
struct list_head list;
};
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- u64 ino, u64 parent)
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = 0;
+ path->skip_locking = 0;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- LIST_HEAD(inode_list);
- int ret = 0;
+ struct inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes, in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fallback to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
- list_add_tail(&ino_elem->list, &inode_list);
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
- while (!list_empty(&inode_list)) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct inode *inode;
+ return 0;
+}
- ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
- list);
- ino = ino_elem->ino;
- parent = ino_elem->parent;
- list_del(&ino_elem->list);
- kfree(ino_elem);
- if (ret)
- continue;
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
- btrfs_release_path(path);
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
- key.objectid = ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root);
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
+
+ inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
- * directory.
+ * directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- key.objectid = parent;
- inode = btrfs_iget(fs_info->sb, &key, root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- } else {
- ret = btrfs_log_inode(trans, root,
- BTRFS_I(inode),
- LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
- btrfs_add_delayed_iput(inode);
- }
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget(fs_info->sb, parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
}
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
continue;
}
+
/*
- * If the inode was already logged skip it - otherwise we can
- * hit an infinite loop. Example:
- *
- * From the commit root (previous transaction) we have the
- * following inodes:
- *
- * inode 257 a directory
- * inode 258 with references "zz" and "zz_link" on inode 257
- * inode 259 with reference "a" on inode 257
- *
- * And in the current (uncommitted) transaction we have:
- *
- * inode 257 a directory, unchanged
- * inode 258 with references "a" and "a2" on inode 257
- * inode 259 with reference "zz_link" on inode 257
- * inode 261 with reference "zz" on inode 257
- *
- * When logging inode 261 the following infinite loop could
- * happen if we don't skip already logged inodes:
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
*
- * - we detect inode 258 as a conflicting inode, with inode 261
- * on reference "zz", and log it;
- *
- * - we detect inode 259 as a conflicting inode, with inode 258
- * on reference "a", and log it;
- *
- * - we detect inode 258 as a conflicting inode, with inode 259
- * on reference "zz_link", and log it - again! After this we
- * repeat the above steps forever.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- /*
- * Check the inode's logged_trans only instead of
- * btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
*/
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- spin_unlock(&BTRFS_I(inode)->lock);
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
btrfs_add_delayed_iput(inode);
continue;
}
- spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
@@ -4873,68 +5747,539 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
- if (ret) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
+ }
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_add_delayed_iput(inode);
- continue;
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
+ return ret;
+}
+
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_key *min_key,
+ const struct btrfs_key *max_key,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ const u64 logged_isize,
+ const int inode_only,
+ struct btrfs_log_ctx *ctx,
+ bool *need_log_inode_item)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ struct btrfs_root *root = inode->root;
+ int ins_start_slot = 0;
+ int ins_nr = 0;
+ int ret;
+
+ while (1) {
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
}
+again:
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key->objectid != max_key->objectid)
+ break;
+ if (min_key->type > max_key->type)
+ break;
- while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
+ *need_log_inode_item = false;
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
u64 other_ino = 0;
u64 other_parent = 0;
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- break;
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0], min_key, inode,
+ &other_ino, &other_parent);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0 &&
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
}
- continue;
- }
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr,
+ inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != ino ||
- (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = 0;
- break;
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
+ if (ret)
+ return ret;
+ goto next_key;
}
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ goto next_slot;
+ }
+
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
- ret = btrfs_check_ref_name_override(leaf, slot, &key,
- BTRFS_I(inode), &other_ino,
- &other_parent);
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+next_key:
+ if (min_key->offset < (u64)-1) {
+ min_key->offset++;
+ } else if (min_key->type < max_key->type) {
+ min_key->type++;
+ min_key->offset = 0;
+ } else {
+ break;
+ }
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
+
+ return ret;
+}
+
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc(max_batch_size * sizeof(u32) +
+ max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
break;
- if (ret > 0) {
- ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
- if (!ino_elem) {
- ret = -ENOMEM;
- break;
- }
- ino_elem->ino = other_ino;
- ino_elem->parent = other_parent;
- list_add_tail(&ino_elem->list, &inode_list);
- ret = 0;
- }
- path->slots[0]++;
+ curr = next;
+ next = list_next_entry(next, log_list);
}
- btrfs_add_delayed_iput(inode);
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * No need for the log mutex, plus to avoid potential deadlocks or
+ * lockdep annotations due to nesting of delayed inode mutexes and log
+ * mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ continue;
+ }
+
+ if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+
+ btrfs_add_delayed_iput(di_inode);
+
+ if (ret)
+ break;
}
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
+
return ret;
}
@@ -4953,30 +6298,26 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
- int err = 0;
+ struct btrfs_root *log = inode->root->log_root;
int ret;
- int nritems;
- int ins_start_slot = 0;
- int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
- bool recursive_logging = false;
+ bool inode_item_dropped = true;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
path = btrfs_alloc_path();
if (!path)
@@ -5004,33 +6345,81 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
/*
- * Only run delayed items if we are a dir or a new file.
- * Otherwise commit the delayed inode only, which is needed in
- * order for the log replay code to mark inodes for link count
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode()
+ * by having something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so lets prevent deep levels of
+ * recursion to happen by limiting the maximum depth to be 1.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
- inode->generation > fs_info->last_trans_committed)
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (ret)
+ goto out;
}
- if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
- recursive_logging = true;
- if (inode_only == LOG_OTHER_INODE)
- inode_only = LOG_INODE_EXISTS;
- else
- inode_only = LOG_INODE_ALL;
- mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
- } else {
- mutex_lock(&inode->log_mutex);
+ mutex_lock(&inode->log_mutex);
+
+ /*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
+ * This is for cases where logging a directory could result in losing a
+ * a file after replaying the log. For example, if we move a file from a
+ * directory A to a directory B, then fsync directory A, we have no way
+ * to known the file was moved from A to B, so logging just A would
+ * result in losing the file after a log replay.
+ */
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
+ btrfs_set_log_full_commit(trans);
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_unlock;
}
/*
@@ -5038,13 +6427,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ BTRFS_XATTR_ITEM_KEY);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5058,27 +6446,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
- err = logged_inode_size(log, inode, path, &logged_isize);
- if (err)
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, &inode->vfs_inode, 0, 0);
- if (ret != -EAGAIN)
- break;
- }
+ if (ctx->logged_before)
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
@@ -5086,520 +6472,163 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
+ inode_item_dropped = false;
goto log_extents;
}
}
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
- while (1) {
- ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key,
- path, trans->transid);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- if (ret != 0)
- break;
-again:
- /* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != ino)
- break;
- if (min_key.type > max_key.type)
- break;
-
- if (min_key.type == BTRFS_INODE_ITEM_KEY)
- need_log_inode_item = false;
-
- if ((min_key.type == BTRFS_INODE_REF_KEY ||
- min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- ret = btrfs_check_ref_name_override(path->nodes[0],
- path->slots[0], &min_key, inode,
- &other_ino, &other_parent);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- } else if (ret > 0 && ctx &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- if (ins_nr > 0) {
- ins_nr++;
- } else {
- ins_nr = 1;
- ins_start_slot = path->slots[0];
- }
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
-
- err = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
- if (err)
- goto out_unlock;
- btrfs_release_path(path);
- goto next_key;
- }
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (ins_nr == 0)
- goto next_slot;
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- goto next_slot;
- }
-
- if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
- ins_nr++;
- goto next_slot;
- } else if (!ins_nr) {
- ins_start_slot = path->slots[0];
- ins_nr = 1;
- goto next_slot;
- }
-
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 1;
- ins_start_slot = path->slots[0];
-next_slot:
-
- nritems = btrfs_header_nritems(path->nodes[0]);
- path->slots[0]++;
- if (path->slots[0] < nritems) {
- btrfs_item_key_to_cpu(path->nodes[0], &min_key,
- path->slots[0]);
- goto again;
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
- btrfs_release_path(path);
-next_key:
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- } else if (min_key.type < max_key.type) {
- min_key.type++;
- min_key.offset = 0;
- } else {
- break;
- }
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ path, dst_path, logged_isize,
+ inode_only, ctx,
+ &need_log_inode_item);
+ if (ret)
+ goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
- if (err)
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
goto out_unlock;
}
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode);
- if (!err && !xattrs_logged) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
+ goto out_unlock;
+ /*
+ * If we are doing a fast fsync and the inode was logged before
+ * in this transaction, we don't need to log the xattrs because
+ * they were logged before. If xattrs were added, changed or
+ * deleted since the last time we logged the inode, then we have
+ * already logged them because the inode had the runtime flag
+ * BTRFS_INODE_COPY_EVERYTHING set.
+ */
+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
+ goto out_unlock;
btrfs_release_path(path);
}
- if (err)
- goto out_unlock;
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
- if (ret) {
- err = ret;
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
+ if (ret)
goto out_unlock;
- }
} else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
- if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
- if (ret) {
- err = ret;
+ if (full_dir_logging) {
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
goto out_unlock;
- }
}
- /*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
- */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ /*
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
+ */
+ if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
-out_unlock:
- mutex_unlock(&inode->log_mutex);
-
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return err;
-}
-
-/*
- * Check if we must fallback to a transaction commit when logging an inode.
- * This must be called after logging the inode and is used only in the context
- * when fsyncing an inode requires the need to log some other inode - in which
- * case we can't lock the i_mutex of each other inode we need to log as that
- * can lead to deadlocks with concurrent fsync against other inodes (as we can
- * log inodes up or down in the hierarchy) or rename operations for example. So
- * we take the log_mutex of the inode after we have logged it and then check for
- * its last_unlink_trans value - this is safe because any task setting
- * last_unlink_trans must take the log_mutex and it must do this before it does
- * the actual unlink operation, so if we do this check before a concurrent task
- * sets last_unlink_trans it means we've logged a consistent version/state of
- * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task might have only updated last_unlink_trans before
- * we logged the inode or it might have also done the unlink).
- */
-static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool ret = false;
-
- mutex_lock(&inode->log_mutex);
- if (inode->last_unlink_trans > fs_info->last_trans_committed) {
- /*
- * Make sure any commits to the log are forced to be full
- * commits.
- */
- btrfs_set_log_full_commit(trans);
- ret = true;
- }
- mutex_unlock(&inode->log_mutex);
-
- return ret;
-}
-
-/*
- * follow the dentry parent pointers up the chain and see if any
- * of the directories in it require a full commit before they can
- * be logged. Returns zero if nothing special needs to be done or 1 if
- * a full commit is required.
- */
-static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode,
- struct dentry *parent,
- struct super_block *sb,
- u64 last_committed)
-{
- int ret = 0;
- struct dentry *old_parent = NULL;
/*
- * for regular files, if its inode is already on disk, we don't
- * have to worry about the parents at all. This is because
- * we can use the last_unlink_trans field to record renames
- * and other fun in this file.
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
*/
- if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed)
- goto out;
-
- if (!S_ISDIR(inode->vfs_inode.i_mode)) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- goto out;
- inode = BTRFS_I(d_inode(parent));
- }
-
- while (1) {
- if (btrfs_must_commit_transaction(trans, inode)) {
- ret = 1;
- break;
- }
-
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
- if (IS_ROOT(parent)) {
- inode = BTRFS_I(d_inode(parent));
- if (btrfs_must_commit_transaction(trans, inode))
- ret = 1;
- break;
- }
-
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- inode = BTRFS_I(d_inode(parent));
-
- }
- dput(old_parent);
+out_unlock:
+ mutex_unlock(&inode->log_mutex);
out:
- return ret;
-}
-
-struct btrfs_dir_list {
- u64 ino;
- struct list_head list;
-};
-
-/*
- * Log the inodes of the new dentries of a directory. See log_dir_items() for
- * details about the why it is needed.
- * This is a recursive operation - if an existing dentry corresponds to a
- * directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
- * the dentries point to we do not lock their i_mutex, otherwise lockdep
- * complains about the following circular lock dependency / possible deadlock:
- *
- * CPU0 CPU1
- * ---- ----
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(sb_internal#2);
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(&sb->s_type->i_mutex_key#14);
- *
- * Where sb_internal is the lock (a counter that works as a lock) acquired by
- * sb_start_intwrite() in btrfs_start_transaction().
- * Not locking i_mutex of the inodes is still safe because:
- *
- * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
- * that while logging the inode new references (names) are added or removed
- * from the inode, leaving the logged inode item with a link count that does
- * not match the number of logged inode reference items. This is fine because
- * at log replay time we compute the real number of links and correct the
- * link count in the inode item (see replay_one_buffer() and
- * link_to_fixup_dir());
- *
- * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
- * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
- * has a size that doesn't match the sum of the lengths of all the logged
- * names. This does not result in a problem because if a dir_item key is
- * logged but its matching dir_index key is not logged, at log replay time we
- * don't use it to replay the respective name (see replay_one_name()). On the
- * other hand if only the dir_index key ends up being logged, the respective
- * name is added to the fs/subvol tree with both the dir_item and dir_index
- * keys created (see replay_one_name()).
- * The directory's inode item with a wrong i_size is not a problem as well,
- * since we don't use it at log replay time to set the i_size in the inode
- * item of the fs/subvol tree (see overwrite_item()).
- */
-static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *start_inode,
- struct btrfs_log_ctx *ctx)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *log = root->log_root;
- struct btrfs_path *path;
- LIST_HEAD(dir_list);
- struct btrfs_dir_list *dir_elem;
- int ret = 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
- if (!dir_elem) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
- dir_elem->ino = btrfs_ino(start_inode);
- list_add_tail(&dir_elem->list, &dir_list);
-
- while (!list_empty(&dir_list)) {
- struct extent_buffer *leaf;
- struct btrfs_key min_key;
- int nritems;
- int i;
-
- dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
- list);
- if (ret)
- goto next_dir_inode;
-
- min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_ITEM_KEY;
- min_key.offset = 0;
-again:
- btrfs_release_path(path);
- ret = btrfs_search_forward(log, &min_key, path, trans->transid);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
-
-process_leaf:
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
- struct btrfs_key di_key;
- struct inode *di_inode;
- struct btrfs_dir_list *new_dir_elem;
- int log_mode = LOG_INODE_EXISTS;
- int type;
-
- btrfs_item_key_to_cpu(leaf, &min_key, i);
- if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_ITEM_KEY)
- goto next_dir_inode;
-
- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
- type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid &&
- type != BTRFS_FT_DIR)
- continue;
- btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
- if (di_key.type == BTRFS_ROOT_ITEM_KEY)
- continue;
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
- btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, &di_key, root);
- if (IS_ERR(di_inode)) {
- ret = PTR_ERR(di_inode);
- goto next_dir_inode;
- }
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
- if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- btrfs_add_delayed_iput(di_inode);
- break;
- }
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
- ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
- log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
- ret = 1;
- btrfs_add_delayed_iput(di_inode);
- if (ret)
- goto next_dir_inode;
- if (ctx->log_new_dentries) {
- new_dir_elem = kmalloc(sizeof(*new_dir_elem),
- GFP_NOFS);
- if (!new_dir_elem) {
- ret = -ENOMEM;
- goto next_dir_inode;
- }
- new_dir_elem->ino = di_key.objectid;
- list_add_tail(&new_dir_elem->list, &dir_list);
- }
- break;
- }
- if (i == nritems) {
- ret = btrfs_next_leaf(log, path);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
- goto process_leaf;
- }
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- goto again;
- }
-next_dir_inode:
- list_del(&dir_elem->list);
- kfree(dir_elem);
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
}
- btrfs_free_path(path);
return ret;
}
@@ -5648,7 +6677,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
break;
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
@@ -5672,7 +6701,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, &inode_key, root);
+ dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
+ root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -5701,15 +6731,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
- ret = 1;
- if (!ret && ctx && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, root,
+ if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
+ btrfs_add_delayed_iput(dir_inode);
+ continue;
+ }
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
+ LOG_INODE_ALL, ctx);
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
@@ -5734,26 +6765,28 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
while (true) {
struct btrfs_fs_info *fs_info = root->fs_info;
- const u64 last_committed = fs_info->last_trans_committed;
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
struct btrfs_key search_key;
struct inode *inode;
+ u64 ino;
int ret = 0;
btrfs_release_path(path);
+ ino = found_key.offset;
+
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &search_key, root);
+ inode = btrfs_iget(fs_info->sb, ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation > last_committed)
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ if (BTRFS_I(inode)->generation >= trans->transid &&
+ need_log_inode(trans, BTRFS_I(inode)))
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5792,7 +6825,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct dentry *old_parent = NULL;
struct super_block *sb = inode->vfs_inode.i_sb;
int ret = 0;
@@ -5806,9 +6838,10 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (root != inode->root)
break;
- if (inode->generation > fs_info->last_trans_committed) {
- ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode)) {
+ ret = btrfs_log_inode(trans, inode,
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5916,51 +6949,31 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct super_block *sb;
int ret = 0;
- u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
- sb = inode->vfs_inode.i_sb;
-
if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = 1;
- goto end_no_trans;
- }
-
- /*
- * The prev transaction commit doesn't complete, we need do
- * full commit by ourselves.
- */
- if (fs_info->last_trans_log_full_commit >
- fs_info->last_trans_committed) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
if (btrfs_root_refs(&root->root_item) == 0) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
- ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
- last_committed);
- if (ret)
- goto end_no_trans;
-
/*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
*/
- if (btrfs_inode_in_log(inode, trans->transid) ||
+ if ((btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
@@ -5970,7 +6983,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -5981,13 +6994,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed) {
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
log_dentries = true;
/*
@@ -6031,7 +7044,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
* and has a link count of 2.
*/
- if (inode->last_unlink_trans > last_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
@@ -6042,13 +7055,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, inode, ctx);
+ ret = log_new_dir_dentries(trans, inode, ctx);
else
ret = 0;
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
if (ret)
@@ -6066,15 +7079,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6091,7 +7102,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_key tmp_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
@@ -6116,8 +7126,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
@@ -6130,8 +7139,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
@@ -6145,19 +7153,15 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root(log_root_tree, &found_key);
+ log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
- tmp_key.objectid = found_key.offset;
- tmp_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_key.offset = (u64)-1;
-
- wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+ wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
+ true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
@@ -6173,27 +7177,30 @@ again:
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(fs_info,
+ ret = btrfs_pin_extent_for_log_replay(trans,
log->node->start,
log->node->len);
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ btrfs_put_root(log);
if (!ret)
goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
wc.replay_dest->log_root = log;
- btrfs_record_root_in_trans(trans, wc.replay_dest);
- ret = walk_log_tree(trans, log, &wc);
+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+ if (ret)
+ /* The loop needs to continue due to the root refs */
+ btrfs_abort_transaction(trans, ret);
+ else
+ ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
@@ -6209,14 +7216,14 @@ again:
* root->objectid_mutex is not acquired as log replay
* could only happen during mount.
*/
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
wc.replay_dest->log_root = NULL;
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ btrfs_put_root(wc.replay_dest);
+ btrfs_put_root(log);
if (ret)
goto error;
@@ -6247,15 +7254,15 @@ next:
if (ret)
return ret;
- free_extent_buffer(log_root_tree->node);
log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- kfree(log_root_tree);
+ btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -6342,28 +7349,31 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
mutex_unlock(&dir->log_mutex);
}
-/*
- * Call this after adding a new name for a file and it will properly
- * update the log to reflect the new name.
+/**
+ * Update the log after adding a new name for an inode.
*
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
*
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
int ret;
/*
@@ -6377,36 +7387,112 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
}
+ ret = 0;
+
+ /*
+ * If we are doing a rename (old_dir is not NULL) from a directory that
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
+ */
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ ret = join_running_log_trans(root);
+ /*
+ * At least one of the inodes was logged before, so this should
+ * not fail, but if it does, it's not serious, just bail out and
+ * mark the log for a full commit.
+ */
+ if (WARN_ON_ONCE(ret < 0))
+ goto out;
+ log_pinned = true;
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- return BTRFS_NEED_LOG_SYNC;
+ /*
+ * Other concurrent task might be logging the old directory,
+ * as it can be triggered when logging other inode that had or
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in progress directory logging.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
+
+ btrfs_free_path(path);
+ if (ret < 0)
+ goto out;
+ }
+
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..aed1e05e9879 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -12,12 +12,26 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+/* We can't use the tree log for whatever reason, force a transaction commit */
+#define BTRFS_LOG_FORCE_COMMIT (1)
+
struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
+ bool logging_new_delayed_dentries;
+ /* Indicate if the inode being logged was logged before. */
+ bool logged_before;
+ /* Tracks the last logged dir item/index key offset. */
+ u64 last_dir_item_offset;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
+ struct list_head conflict_inodes;
+ int num_conflict_inodes;
+ bool logging_conflict_inodes;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -26,8 +40,28 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
+ ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -49,17 +83,15 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
@@ -67,16 +99,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent);
#endif
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
new file mode 100644
index 000000000000..8a3a14686d3e
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.c
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "tree-mod-log.h"
+#include "disk-io.h"
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 logical;
+ u64 seq;
+ enum btrfs_mod_log_op op;
+
+ /*
+ * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
+ * operations.
+ */
+ int slot;
+
+ /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
+ u64 generation;
+
+ /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+};
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+
+ return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct tree_mod_elem *tm;
+ u64 min_seq = BTRFS_SEQ_LAST;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ } else {
+ struct btrfs_seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we cannot
+ * remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
+ }
+ min_seq = first->seq;
+ }
+
+ /*
+ * Anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = rb_entry(node, struct tree_mod_elem, node);
+ if (tm->seq >= min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Key order of the log:
+ * node/leaf start address -> sequence
+ *
+ * The 'start address' is the logical address of the *new* root node for root
+ * replace operations, or the logical address of the affected block for all
+ * other operations.
+ */
+static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+
+ lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
+
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = rb_entry(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->logical < tm->logical)
+ new = &((*new)->rb_left);
+ else if (cur->logical > tm->logical)
+ new = &((*new)->rb_right);
+ else if (cur->seq < tm->seq)
+ new = &((*new)->rb_left);
+ else if (cur->seq > tm->seq)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+ return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns true if it can. Otherwise, it
+ * returns false with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * write unlock fs_info::tree_mod_log_lock.
+ */
+static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return true;
+ if (eb && btrfs_header_level(eb) == 0)
+ return true;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return true;
+ }
+
+ return false;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return false;
+ if (eb && btrfs_header_level(eb) == 0)
+ return false;
+
+ return true;
+}
+
+static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
+ int slot,
+ enum btrfs_mod_log_op op,
+ gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return NULL;
+
+ tm->logical = eb->start;
+ if (op != BTRFS_MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ kfree(tm);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = eb->start;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+ locked = true;
+
+ /*
+ * When we override something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ if (ret)
+ goto free_tms;
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+ kfree(tm);
+
+ return ret;
+}
+
+static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
+{
+ int i, j;
+ int ret;
+
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal)
+{
+ struct btrfs_fs_info *fs_info = old_root->fs_info;
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = new_root->start;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = tree_mod_log_insert(fs_info, tm);
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
+}
+
+static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq,
+ bool smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = rb_entry(node, struct tree_mod_elem, node);
+ if (cur->logical < start) {
+ node = node->rb_left;
+ } else if (cur->logical > start) {
+ node = node->rb_right;
+ } else if (cur->seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* We want the node with the highest seq */
+ if (found)
+ BUG_ON(found->seq > cur->seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->seq > min_seq) {
+ /* We want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->seq < cur->seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * This returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). Any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, true);
+}
+
+/*
+ * This returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). Any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, false);
+}
+
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items)
+{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return 0;
+
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
+ for (i = 0; i < nr_items; i++) {
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = true;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
+{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+
+ ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
+}
+
+/*
+ * Returns the logical address of the oldest predecessor of the given root.
+ * Entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
+ u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = eb_root->start;
+ bool looped = false;
+
+ if (!time_seq)
+ return NULL;
+
+ /*
+ * The very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). This has the logical address
+ * of the *new* root, making it the very first operation that's logged
+ * for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return NULL;
+ /*
+ * If there are no tree operation for the oldest root, we simply
+ * return it. This should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * If there's an operation that's not a root replacement, we
+ * found the oldest version of our root. Normally, we'll find a
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ looped = true;
+ }
+
+ /* If there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. Then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq).
+ */
+static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ read_lock(&fs_info->tree_mod_log_lock);
+ while (tm && tm->seq >= time_seq) {
+ /*
+ * All the operations are recorded with the operator used for
+ * the modification. As we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ fallthrough;
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case BTRFS_MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case BTRFS_MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case BTRFS_MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case BTRFS_MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case BTRFS_MOD_LOG_ROOT_REPLACE:
+ /*
+ * This operation is special. For roots, this must be
+ * handled explicitly before rewinding.
+ * For non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. In the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. We simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = rb_entry(next, struct tree_mod_elem, node);
+ if (tm->logical != first_tm->logical)
+ break;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+ btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ }
+
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
+ btrfs_tree_read_lock(eb_rewin);
+ tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb_rewin;
+}
+
+/*
+ * Rewind the state of @root's root node to the given @time_seq value.
+ * If there are no changes, the current root->root_node is returned. If anything
+ * changed in between, there's a fresh buffer allocated on which the rewind
+ * operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
+ struct extent_buffer *old;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+ int level;
+
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (!tm)
+ return eb_root;
+
+ if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ level = old_root->level;
+ } else {
+ logical = eb_root->start;
+ level = btrfs_header_level(eb_root);
+ }
+
+ tm = tree_mod_log_search(fs_info, logical, time_seq);
+ if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ old = read_tree_block(fs_info, logical, root->root_key.objectid,
+ 0, level, NULL);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
+ btrfs_warn(fs_info,
+ "failed to read tree block %llu from get_old_root",
+ logical);
+ } else {
+ struct tree_mod_elem *tm2;
+
+ btrfs_tree_read_lock(old);
+ eb = btrfs_clone_extent_buffer(old);
+ /*
+ * After the lookup for the most recent tree mod operation
+ * above and before we locked and cloned the extent buffer
+ * 'old', a new tree mod log operation may have been added.
+ * So lookup for a more recent one to make sure the number
+ * of mod log operations we replay is consistent with the
+ * number of items we have in the cloned extent buffer,
+ * otherwise we can hit a BUG_ON when rewinding the extent
+ * buffer.
+ */
+ tm2 = tree_mod_log_search(fs_info, logical, time_seq);
+ btrfs_tree_read_unlock(old);
+ free_extent_buffer(old);
+ ASSERT(tm2);
+ ASSERT(tm2 == tm || tm2->seq > tm->seq);
+ if (!tm2 || tm2->seq < tm->seq) {
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ tm = tm2;
+ }
+ } else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ eb = alloc_dummy_extent_buffer(fs_info, logical);
+ } else {
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ }
+
+ if (!eb)
+ return NULL;
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, eb_root_owner);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
+ if (tm)
+ tree_mod_log_rewind(fs_info, eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
+
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
+ level = tm->old_root.level;
+ else
+ level = btrfs_header_level(eb_root);
+
+ free_extent_buffer(eb_root);
+
+ return level;
+}
+
+/*
+ * Return the lowest sequence number in the tree modification log.
+ *
+ * Return the sequence number of the oldest tree modification log user, which
+ * corresponds to the lowest sequence number of all existing users. If there are
+ * no users it returns 0.
+ */
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
+{
+ u64 ret = 0;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct btrfs_seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ ret = elem->seq;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return ret;
+}
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
new file mode 100644
index 000000000000..12605d19621b
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef BTRFS_TREE_MOD_LOG_H
+#define BTRFS_TREE_MOD_LOG_H
+
+#include "ctree.h"
+
+/* Represents a tree mod log user. */
+struct btrfs_seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define BTRFS_SEQ_LAST ((u64)-1)
+
+enum btrfs_mod_log_op {
+ BTRFS_MOD_LOG_KEY_REPLACE,
+ BTRFS_MOD_LOG_KEY_ADD,
+ BTRFS_MOD_LOG_KEY_REMOVE,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ BTRFS_MOD_LOG_MOVE_KEYS,
+ BTRFS_MOD_LOG_ROOT_REPLACE,
+};
+
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal);
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags);
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq);
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items);
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items);
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 76b84f2397b1..b458452a1aaf 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -52,7 +52,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
offset = btrfs_item_ptr_offset(eb, slot);
ret = -ENOENT;
@@ -125,12 +125,11 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
- offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+ offset += btrfs_item_size(eb, slot) - sizeof(subid_le);
} else {
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
- ret, (unsigned long long)key.objectid,
- (unsigned long long)key.offset, type);
+ ret, key.objectid, key.offset, type);
goto out;
}
@@ -187,7 +186,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
@@ -209,7 +208,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
goto out;
}
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (item_size == sizeof(subid)) {
ret = btrfs_del_item(trans, uuid_root, path);
goto out;
@@ -246,9 +245,49 @@ out:
return ret;
}
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
- int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
- u64))
+/*
+ * Check if there's an matching subvolume for given UUID
+ *
+ * Return:
+ * 0 check succeeded, the entry is not outdated
+ * > 0 if the check failed, the caller should remove the entry
+ * < 0 if an error occurred
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+ u8 *uuid, u8 type, u64 subvolid)
+{
+ int ret = 0;
+ struct btrfs_root *subvol_root;
+
+ if (type != BTRFS_UUID_KEY_SUBVOL &&
+ type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto out;
+
+ subvol_root = btrfs_get_fs_root(fs_info, subvolid, true);
+ if (IS_ERR(subvol_root)) {
+ ret = PTR_ERR(subvol_root);
+ if (ret == -ENOENT)
+ ret = 1;
+ goto out;
+ }
+
+ switch (type) {
+ case BTRFS_UUID_KEY_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.received_uuid,
+ BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ }
+ btrfs_put_root(subvol_root);
+out:
+ return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
@@ -278,6 +317,10 @@ again_search_slot:
}
while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
leaf = path->nodes[0];
slot = path->slots[0];
@@ -288,7 +331,7 @@ again_search_slot:
goto skip;
offset = btrfs_item_ptr_offset(leaf, slot);
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
btrfs_warn(fs_info,
"uuid item with illegal size %lu!",
@@ -305,7 +348,8 @@ again_search_slot:
read_extent_buffer(leaf, &subid_le, offset,
sizeof(subid_le));
subid_cpu = le64_to_cpu(subid_le);
- ret = check_func(fs_info, uuid, key.type, subid_cpu);
+ ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
+ key.type, subid_cpu);
if (ret < 0)
goto out;
if (ret > 0) {
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
new file mode 100644
index 000000000000..ee00e33c309e
--- /dev/null
+++ b/fs/btrfs/verity.c
@@ -0,0 +1,812 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code. The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type. These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ *
+ * Another important consideration is the fact that the Merkle tree data scales
+ * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
+ * ~1/127th the size) so for large files, writing the tree can be a lengthy
+ * operation. For that reason, we guard the whole enable verity operation
+ * (between begin_enable_verity and end_enable_verity) with an orphan item.
+ * Again, because the data can be pretty large, it's quite possible that we
+ * could run out of space writing it, so we try our best to handle errors by
+ * stopping and rolling back rather than aborting the victim transaction.
+ */
+
+#define MERKLE_START_ALIGN 65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode: inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+ u64 sz = inode->i_size;
+ u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+ if (rounded > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode: inode to drop items for
+ * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ * BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int count = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ /* 1 for the item being dropped */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /*
+ * Walk backwards through all the items until we find one that
+ * isn't from our key type or objectid
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = 0;
+ /* No more keys of this type, we're done */
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* No more keys of this type, we're done */
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ /*
+ * This shouldn't be a performance sensitive function because
+ * it's not used as part of truncate. If it ever becomes
+ * perf sensitive, change this to walk forward and bulk delete
+ * items
+ */
+ ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ count++;
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+ ret = count;
+ btrfs_end_transaction(trans);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode: inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ int ret;
+
+ ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+ ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode: inode to insert for
+ * @key_type: key type to insert
+ * @offset: item offset to insert at
+ * @src: source data to write
+ * @len: length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ const char *src, u64 len)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long copy_bytes;
+ unsigned long src_offset = 0;
+ void *data;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (len > 0) {
+ /* 1 for the new item being inserted */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ /*
+ * Insert 2K at a time mostly to be friendly for smaller leaf
+ * size filesystems
+ */
+ copy_bytes = min_t(u64, len, 2048);
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ break;
+ }
+
+ leaf = path->nodes[0];
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ write_extent_buffer(leaf, src + src_offset,
+ (unsigned long)data, copy_bytes);
+ offset += copy_bytes;
+ src_offset += copy_bytes;
+ len -= copy_bytes;
+
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode: inode to read items of
+ * @key_type: key type to read
+ * @offset: item offset to read from
+ * @dest: Buffer to read into. This parameter has slightly tricky
+ * semantics. If it is NULL, the function will not do any copying
+ * and will just return the size of all the items up to len bytes.
+ * If dest_page is passed, then the function will kmap_local the
+ * page and ignore dest, but it must still be non-NULL to avoid the
+ * counting-only behavior.
+ * @len: length in bytes to read
+ * @dest_page: copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree. This returns the number of
+ * bytes read or < 0 for errors. We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length. Supports
+ * reading into a provided buffer (dest) or into the page cache
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ char *dest, u64 len, struct page *dest_page)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 item_end;
+ u64 copy_end;
+ int copied = 0;
+ u32 copy_offset;
+ unsigned long copy_bytes;
+ unsigned long dest_offset = 0;
+ void *data;
+ char *kaddr = dest;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (dest_page)
+ path->reada = READA_FORWARD;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (len > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ item_end = btrfs_item_size(leaf, path->slots[0]) + key.offset;
+
+ if (copied > 0) {
+ /*
+ * Once we've copied something, we want all of the items
+ * to be sequential
+ */
+ if (key.offset != offset)
+ break;
+ } else {
+ /*
+ * Our initial offset might be in the middle of an
+ * item. Make sure it all makes sense.
+ */
+ if (key.offset > offset)
+ break;
+ if (item_end <= offset)
+ break;
+ }
+
+ /* desc = NULL to just sum all the item lengths */
+ if (!dest)
+ copy_end = item_end;
+ else
+ copy_end = min(offset + len, item_end);
+
+ /* Number of bytes in this item we want to copy */
+ copy_bytes = copy_end - offset;
+
+ /* Offset from the start of item for copying */
+ copy_offset = offset - key.offset;
+
+ if (dest) {
+ if (dest_page)
+ kaddr = kmap_local_page(dest_page);
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ read_extent_buffer(leaf, kaddr + dest_offset,
+ (unsigned long)data + copy_offset,
+ copy_bytes);
+
+ if (dest_page)
+ kunmap_local(kaddr);
+ }
+
+ offset += copy_bytes;
+ dest_offset += copy_bytes;
+ len -= copy_bytes;
+ copied += copy_bytes;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ /*
+ * We've reached the last slot in this leaf and we need
+ * to go to the next leaf.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+out:
+ btrfs_free_path(path);
+ if (!ret)
+ ret = copied;
+ return ret;
+}
+
+/*
+ * Delete an fsverity orphan
+ *
+ * @trans: transaction to do the delete in
+ * @inode: inode to orphan
+ *
+ * Capture verity orphan specific logic that is repeated in the couple places
+ * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
+ * with 0 links.
+ *
+ * Returns zero on success or a negative error code on failure.
+ */
+static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ /*
+ * If the inode has no links, it is either already unlinked, or was
+ * created with O_TMPFILE. In either case, it should have an orphan from
+ * that other operation. Rather than reference count the orphans, we
+ * simply ignore them here, because we only invoke the verity path in
+ * the orphan logic when i_nlink is 1.
+ */
+ if (!inode->vfs_inode.i_nlink)
+ return 0;
+
+ ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode: inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+ truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ ret = btrfs_drop_verity_items(inode);
+ if (ret) {
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to drop verity items in rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to start transaction in verity rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+ inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ ret = del_orphan(trans, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ if (trans)
+ btrfs_end_transaction(trans);
+ return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode: inode to be marked as verity
+ * @desc: contents of the verity descriptor to write (not NULL)
+ * @desc_size: size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - delete the orphan item
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+ size_t desc_size)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_verity_descriptor_item item;
+ int ret;
+
+ /* Write out the descriptor item */
+ memset(&item, 0, sizeof(item));
+ btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (const char *)&item, sizeof(item));
+ if (ret)
+ goto out;
+
+ /* Write out the descriptor itself */
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ desc, desc_size);
+ if (ret)
+ goto out;
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto end_trans;
+ ret = del_orphan(trans, inode);
+ if (ret)
+ goto end_trans;
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+ btrfs_end_transaction(trans);
+out:
+ return ret;
+
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp: file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items, add
+ * an orphan and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+ return -EBUSY;
+
+ /*
+ * This should almost never do anything, but theoretically, it's
+ * possible that we failed to enable verity on a file, then were
+ * interrupted or failed while rolling back, failed to cleanup the
+ * orphan, and finally attempt to enable verity again.
+ */
+ ret = btrfs_drop_verity_items(inode);
+ if (ret)
+ return ret;
+
+ /* 1 for the orphan item */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (!ret)
+ set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_end_transaction(trans);
+
+ return 0;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp: file we are finishing enabling verity on
+ * @desc: verity descriptor to write out (NULL in error conditions)
+ * @desc_size: size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size: size of the merkle tree in bytes
+ *
+ * If desc is null, then VFS is signaling an error occurred during verity
+ * enable, and we should try to rollback. Otherwise, attempt to finish verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ int ret = 0;
+ int rollback_ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (desc == NULL)
+ goto rollback;
+
+ ret = finish_verity(inode, desc, desc_size);
+ if (ret)
+ goto rollback;
+ return ret;
+
+rollback:
+ rollback_ret = rollback_verity(inode);
+ if (rollback_ret)
+ btrfs_err(inode->root->fs_info,
+ "failed to rollback verity items: %d", rollback_ret);
+ return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode: inode to get the descriptor of
+ * @buf: output buffer for the descriptor contents
+ * @buf_size: size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two pass setup for reading the descriptor, in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and then in
+ * the second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
+{
+ u64 true_size;
+ int ret = 0;
+ struct btrfs_verity_descriptor_item item;
+
+ memset(&item, 0, sizeof(item));
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (char *)&item, sizeof(item), NULL);
+ if (ret < 0)
+ return ret;
+
+ if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ return -EUCLEAN;
+
+ true_size = btrfs_stack_verity_descriptor_size(&item);
+ if (true_size > INT_MAX)
+ return -EUCLEAN;
+
+ if (buf_size == 0)
+ return true_size;
+ if (buf_size < true_size)
+ return -ERANGE;
+
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ buf, buf_size, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret != true_size)
+ return -EIO;
+
+ return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode: inode to read a merkle tree page for
+ * @index: page index relative to the start of the merkle tree
+ * @num_ra_pages: number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index,
+ unsigned long num_ra_pages)
+{
+ struct page *page;
+ u64 off = (u64)index << PAGE_SHIFT;
+ loff_t merkle_pos = merkle_file_pos(inode);
+ int ret;
+
+ if (merkle_pos < 0)
+ return ERR_PTR(merkle_pos);
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+ return ERR_PTR(-EFBIG);
+ index += merkle_pos >> PAGE_SHIFT;
+again:
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (page) {
+ if (PageUptodate(page))
+ return page;
+
+ lock_page(page);
+ /*
+ * We only insert uptodate pages, so !Uptodate has to be
+ * an error
+ */
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ unlock_page(page);
+ return page;
+ }
+
+ page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Merkle item keys are indexed from byte 0 in the merkle tree.
+ * They have the form:
+ *
+ * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
+ */
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+ page_address(page), PAGE_SIZE, page);
+ if (ret < 0) {
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+ if (ret < PAGE_SIZE)
+ memzero_page(page, ret, PAGE_SIZE - ret);
+
+ SetPageUptodate(page);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+ if (!ret) {
+ /* Inserted and ready for fsverity */
+ unlock_page(page);
+ } else {
+ put_page(page);
+ /* Did someone race us into inserting this page? */
+ if (ret == -EEXIST)
+ goto again;
+ page = ERR_PTR(ret);
+ }
+ return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode: inode to write a Merkle tree block for
+ * @buf: Merkle tree data block to write
+ * @index: index of the block in the Merkle tree
+ * @log_blocksize: log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ u64 off = index << log_blocksize;
+ u64 len = 1ULL << log_blocksize;
+ loff_t merkle_pos = merkle_file_pos(inode);
+
+ if (merkle_pos < 0)
+ return merkle_pos;
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+ return -EFBIG;
+
+ return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+ off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+ .begin_enable_verity = btrfs_begin_enable_verity,
+ .end_enable_verity = btrfs_end_enable_verity,
+ .get_verity_descriptor = btrfs_get_verity_descriptor,
+ .read_merkle_tree_page = btrfs_read_merkle_tree_page,
+ .write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9cfc668f91f4..635f45f1a2ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4,9 +4,9 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
@@ -14,6 +14,7 @@
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
@@ -31,13 +32,20 @@
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
+#include "zoned.h"
+
+static struct bio_set btrfs_bioset;
+
+#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10 | \
+ BTRFS_BLOCK_GROUP_RAID56_MASK)
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
+ .devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
@@ -102,7 +110,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
- .devs_min = 2,
+ .devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
@@ -152,6 +160,20 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ if (!profile)
+ return BTRFS_RAID_SINGLE;
+
+ return BTRFS_BG_FLAG_TO_INDEX(profile);
+}
+
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -162,6 +184,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags)
return btrfs_raid_array[index].raid_name;
}
+int btrfs_nr_parity_stripes(u64 type)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
+
+ return btrfs_raid_array[index].nparity;
+}
+
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
@@ -218,13 +247,12 @@ out_overflow:;
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
- int mirror_num, int need_raid_map);
+ enum btrfs_map_op op, u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map);
/*
* Device locking
@@ -246,7 +274,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*
* global::fs_devs - add, remove, updates to the global list
*
- * does not protect: manipulation of the fs_devices::devices list!
+ * does not protect: manipulation of the fs_devices::devices list in general
+ * but in mount context it could be used to exclude list modifications by eg.
+ * scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
@@ -259,6 +289,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
+ * Is not required at mount and close times, because our device list is
+ * protected by the uuid_mutex at that point.
+ *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -281,14 +314,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* ============
*
* uuid_mutex
- * volume_mutex
- * device_list_mutex
- * chunk_mutex
- * balance_mutex
+ * device_list_mutex
+ * chunk_mutex
+ * balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -314,11 +346,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
@@ -352,6 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -368,7 +401,7 @@ void btrfs_free_device(struct btrfs_device *device)
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
- bio_put(device->flush_bio);
+ btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -397,43 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
}
}
-/*
- * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
- * Returned struct is not linked onto any lists and must be destroyed using
- * btrfs_free_device.
- */
-static struct btrfs_device *__alloc_device(void)
-{
- struct btrfs_device *dev;
-
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
- if (!dev)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- INIT_LIST_HEAD(&dev->dev_list);
- INIT_LIST_HEAD(&dev->dev_alloc_list);
- INIT_LIST_HEAD(&dev->post_commit_list);
-
- atomic_set(&dev->reada_in_flight, 0);
- atomic_set(&dev->dev_stats_ccnt, 0);
- btrfs_device_data_ordered_init(dev);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
-
- return dev;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -500,7 +496,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
- struct buffer_head **bh)
+ struct btrfs_super_block **disk_super)
{
int ret;
@@ -512,16 +508,16 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
if (flush)
- filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+ sync_blockdev(*bdev);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
goto error;
}
invalidate_bdev(*bdev);
- *bh = btrfs_read_dev_super(*bdev);
- if (IS_ERR(*bh)) {
- ret = PTR_ERR(*bh);
+ *disk_super = btrfs_read_dev_super(*bdev);
+ if (IS_ERR(*disk_super)) {
+ ret = PTR_ERR(*disk_super);
blkdev_put(*bdev, flags);
goto error;
}
@@ -530,40 +526,31 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
error:
*bdev = NULL;
- *bh = NULL;
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
-{
- int found;
-
- rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
- rcu_read_unlock();
-
- return found == 0;
-}
-
-/*
- * Search and remove all stale (devices which are not mounted) devices.
+/**
+ * Search and remove all stale devices (which are not mounted).
* When both inputs are NULL, it will search and release all stale devices.
- * path: Optional. When provided will it release all unmounted devices
- * matching this path only.
- * skip_dev: Optional. Will skip this device when searching for the stale
+ *
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
+ * @skip_device: Optional. Will skip this device when searching for the stale
* devices.
- * Return: 0 for success or if @path is NULL.
- * -EBUSY if @path is a mounted device.
- * -ENOENT if @path does not match any device in the list.
+ *
+ * Return: 0 for success or if @devt is 0.
+ * -EBUSY if @devt is a mounted device.
+ * -ENOENT if @devt does not match any device in the list.
*/
-static int btrfs_free_stale_devices(const char *path,
- struct btrfs_device *skip_device)
+static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
int ret = 0;
- if (path)
+ lockdep_assert_held(&uuid_mutex);
+
+ if (devt)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
@@ -573,13 +560,11 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
- if (path && ret != 0)
+ if (devt && ret != 0)
ret = -EBUSY;
break;
}
@@ -590,8 +575,6 @@ static int btrfs_free_stale_devices(const char *path,
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -605,13 +588,16 @@ static int btrfs_free_stale_devices(const char *path,
return ret;
}
+/*
+ * This is only used on mount, and we are protected from competing things
+ * messing with our fs_devices by the uuid_mutex, thus we do not need the
+ * fs_devices->device_list_mutex here.
+ */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
{
- struct request_queue *q;
struct block_device *bdev;
- struct buffer_head *bh;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -622,17 +608,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh);
+ &bdev, &disk_super);
if (ret)
return ret;
- disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
- goto error_brelse;
+ goto error_free_page;
if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
- goto error_brelse;
+ goto error_free_page;
device->generation = btrfs_super_generation(disk_super);
@@ -641,7 +626,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
pr_err(
"BTRFS: Invalid seeding and uuid-changed device detected\n");
- goto error_brelse;
+ goto error_free_page;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -653,8 +638,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- q = bdev_get_queue(bdev);
- if (!blk_queue_nonrot(q))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -667,12 +651,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
fs_devices->rw_devices++;
list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
return 0;
-error_brelse:
- brelse(bh);
+error_free_page:
+ btrfs_release_disk_super(disk_super);
blkdev_put(bdev, flags);
return -EINVAL;
@@ -709,7 +693,7 @@ static struct btrfs_fs_devices *find_fsid_changed(
/*
* Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but curently device didn't
+ * multiple successful changes of FSID but currently device didn't
* observe it. Meaning our fsid will be different than theirs. We need
* to handle two subcases :
* 1 - The fs still continues to have different METADATA/FSID uuids.
@@ -778,11 +762,17 @@ static noinline struct btrfs_device *device_list_add(const char *path,
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_t path_devt;
+ int error;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+ error = lookup_bdev(path, &path_devt);
+ if (error)
+ return ERR_PTR(error);
+
if (fsid_change_in_progress) {
if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
@@ -814,9 +804,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
mutex_lock(&fs_devices->device_list_mutex);
- device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL, false);
+ device = btrfs_find_device(fs_devices, &args);
/*
* If this disk has been pulled into an fs devices created by
@@ -861,6 +855,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
+ device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
@@ -921,30 +916,27 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
- struct block_device *path_bdev;
-
- path_bdev = lookup_bdev(path);
- if (IS_ERR(path_bdev)) {
+ if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_CAST(path_bdev);
- }
-
- if (device->bdev != path_bdev) {
- bdput(path_bdev);
- mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(device->fs_info,
- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ btrfs_warn_in_rcu(NULL,
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- bdput(path_bdev);
- btrfs_info_in_rcu(device->fs_info,
- "device fsid %pU devid %llu moved old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ btrfs_info_in_rcu(NULL,
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -958,6 +950,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
+ device->devt = path_devt;
}
/*
@@ -985,11 +978,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *orig_dev;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
- mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
@@ -1017,58 +1011,54 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
rcu_assign_pointer(device->name, name);
}
+ if (orig_dev->zone_info) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = btrfs_clone_dev_zone_info(orig_dev);
+ if (!zone_info) {
+ btrfs_free_device(device);
+ ret = -ENOMEM;
+ goto error;
+ }
+ device->zone_info = zone_info;
+ }
+
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ &device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state) &&
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
- if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
- /*
- * In the first step, keep the device which has
- * the correct fsid and the devid that is used
- * for the dev_replace procedure.
- * In the second step, the dev_replace state is
- * read from the device tree and it is known
- * whether the procedure is really active or
- * not, which means whether this device is
- * used or whether it should be removed.
- */
- if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state)) {
- continue;
- }
- }
+ /*
+ * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
+ * in btrfs_init_dev_replace() so just continue.
+ */
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ continue;
+
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
@@ -1077,21 +1067,31 @@ again:
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state))
- fs_devices->rw_devices--;
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, &latest_dev);
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
@@ -1119,8 +1119,13 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
if (device->bdev) {
@@ -1128,60 +1133,67 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
extent_io_tree_release(&device->alloc_state);
+ /*
+ * Reset the flush error record. We might have a transient flush error
+ * in this mount, and if so we aborted the current transaction and set
+ * the fs to an error state, guaranteeing no super blocks can be further
+ * committed. However that error might be transient and if we unmount the
+ * filesystem and mount it again, we should allow the mount to succeed
+ * (btrfs_check_rw_degradable() should not fail) - if after mounting the
+ * filesystem again we still get flush errors, then we will again abort
+ * any transaction and set the error state, guaranteeing no commits of
+ * unsafe super blocks.
+ */
+ device->last_flush_error = 0;
+
/* Verify the device is back in a pristine state */
ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
ASSERT(list_empty(&device->dev_alloc_list));
ASSERT(list_empty(&device->post_commit_list));
- ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
+ lockdep_assert_held(&uuid_mutex);
+
if (--fs_devices->opened > 0)
- return 0;
+ return;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = false;
-
- return 0;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1189,33 +1201,40 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
- int ret = 0;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
- if (fs_devices->open_devices == 0) {
- ret = -EINVAL;
- goto out;
- }
+ if (fs_devices->open_devices == 0)
+ return -EINVAL;
+
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
-out:
- return ret;
+ fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+
+ return 0;
}
-static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int devid_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
- struct btrfs_device *dev1, *dev2;
+ const struct btrfs_device *dev1, *dev2;
dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
@@ -1233,8 +1252,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int ret;
lockdep_assert_held(&uuid_mutex);
+ /*
+ * The device_list_mutex cannot be taken here in case opening the
+ * underlying device takes further locks like open_mutex.
+ *
+ * We also don't need the lock here as this is called during mount and
+ * exclusion is provided by uuid_mutex
+ */
- mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
@@ -1242,68 +1267,67 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
-static void btrfs_release_disk_super(struct page *page)
+void btrfs_release_disk_super(struct btrfs_super_block *super)
{
- kunmap(page);
+ struct page *page = virt_to_page(super);
+
put_page(page);
}
-static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
- struct page **page,
- struct btrfs_super_block **disk_super)
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ u64 bytenr, u64 bytenr_orig)
{
+ struct btrfs_super_block *disk_super;
+ struct page *page;
void *p;
pgoff_t index;
/* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
- return 1;
+ if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
+ return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
- if (sizeof(**disk_super) > PAGE_SIZE)
- return 1;
+ if (sizeof(*disk_super) > PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
/* make sure our super doesn't straddle pages on disk */
index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
- return 1;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
+ return ERR_PTR(-EINVAL);
/* pull in the page with our super */
- *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
- index, GFP_KERNEL);
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
- if (IS_ERR_OR_NULL(*page))
- return 1;
+ if (IS_ERR(page))
+ return ERR_CAST(page);
- p = kmap(*page);
+ p = page_address(page);
/* align our pointer to the offset of the super block */
- *disk_super = p + offset_in_page(bytenr);
+ disk_super = p + offset_in_page(bytenr);
- if (btrfs_super_bytenr(*disk_super) != bytenr ||
- btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(*page);
- return 1;
+ if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(p);
+ return ERR_PTR(-EINVAL);
}
- if ((*disk_super)->label[0] &&
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
- return 0;
+ return disk_super;
}
-int btrfs_forget_devices(const char *path)
+int btrfs_forget_devices(dev_t devt)
{
int ret;
mutex_lock(&uuid_mutex);
- ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ ret = btrfs_free_stale_devices(devt, NULL);
mutex_unlock(&uuid_mutex);
return ret;
@@ -1321,8 +1345,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
- struct page *page;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
+ int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1332,25 +1356,30 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
- bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
- device = ERR_PTR(-EINVAL);
+ bytenr_orig = btrfs_sb_offset(0);
+ ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ if (ret) {
+ device = ERR_PTR(ret);
goto error_bdev_put;
}
- device = device_list_add(path, disk_super, &new_device_added);
- if (!IS_ERR(device)) {
- if (new_device_added)
- btrfs_free_stale_devices(path, device);
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ if (IS_ERR(disk_super)) {
+ device = ERR_CAST(disk_super);
+ goto error_bdev_put;
}
- btrfs_release_disk_super(page);
+ device = device_list_add(path, disk_super, &new_device_added);
+ if (!IS_ERR(device) && new_device_added)
+ btrfs_free_stale_devices(device->devt, device);
+
+ btrfs_release_disk_super(disk_super);
error_bdev_put:
blkdev_put(bdev, flags);
@@ -1383,6 +1412,120 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
return false;
}
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ /*
+ * We don't care about the starting region like regular
+ * allocator, because we anyway use/reserve the first two zones
+ * for superblock logging.
+ */
+ return ALIGN(start, device->zone_info->zone_size);
+ default:
+ BUG();
+ }
+}
+
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+ u64 *hole_start, u64 *hole_size,
+ u64 num_bytes)
+{
+ u64 zone_size = device->zone_info->zone_size;
+ u64 pos;
+ int ret;
+ bool changed = false;
+
+ ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+ while (*hole_size > 0) {
+ pos = btrfs_find_allocatable_zones(device, *hole_start,
+ *hole_start + *hole_size,
+ num_bytes);
+ if (pos != *hole_start) {
+ *hole_size = *hole_start + *hole_size - pos;
+ *hole_start = pos;
+ changed = true;
+ if (*hole_size < num_bytes)
+ break;
+ }
+
+ ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+ /* Range is ensured to be empty */
+ if (!ret)
+ return changed;
+
+ /* Given hole range was invalid (outside of device) */
+ if (ret == -ERANGE) {
+ *hole_start += *hole_size;
+ *hole_size = 0;
+ return true;
+ }
+
+ *hole_start += zone_size;
+ *hole_size -= zone_size;
+ changed = true;
+ }
+
+ return changed;
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device: the device which we have the hole
+ * @hole_start: starting position of the hole
+ * @hole_size: the size of the hole
+ * @num_bytes: the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_size to reflect the suitable
+ * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+ u64 *hole_size, u64 num_bytes)
+{
+ bool changed = false;
+ u64 hole_end = *hole_start + *hole_size;
+
+ for (;;) {
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ if (dev_extent_hole_check_zoned(device, hole_start,
+ hole_size, num_bytes)) {
+ changed = true;
+ /*
+ * The changed hole can contain pending extent.
+ * Loop again to check that.
+ */
+ continue;
+ }
+ break;
+ default:
+ BUG();
+ }
+
+ break;
+ }
+
+ return changed;
+}
/*
* find_free_dev_extent_start - find free space in the specified device
@@ -1409,7 +1552,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
* check to ensure dev extents are not double allocated.
* This makes the function safe to allocate dev extents but may not report
* correct usable device space, as device extent freed in current transaction
- * is not reported as avaiable.
+ * is not reported as available.
*/
static int find_free_dev_extent_start(struct btrfs_device *device,
u64 num_bytes, u64 search_start, u64 *start,
@@ -1429,12 +1572,10 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
int slot;
struct extent_buffer *l;
- /*
- * We don't want to overwrite the superblock on the drive nor any area
- * used by the boot loader (grub for example), so we make sure to start
- * at an offset of at least 1MB.
- */
- search_start = max_t(u64, search_start, SZ_1M);
+ search_start = dev_extent_search_start(device, search_start);
+
+ WARN_ON(device->zone_info &&
+ !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
path = btrfs_alloc_path();
if (!path)
@@ -1458,14 +1599,9 @@ again:
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
- if (ret > 0) {
- ret = btrfs_previous_item(root, path, key.objectid, key.type);
- if (ret < 0)
- goto out;
- }
while (1) {
l = path->nodes[0];
@@ -1492,18 +1628,8 @@ again:
if (key.offset > search_start) {
hole_size = key.offset - search_start;
-
- /*
- * Have to check before we set max_hole_start, otherwise
- * we could end up sending back this offset anyway.
- */
- if (contains_pending_extent(device, &search_start,
- hole_size)) {
- if (key.offset >= search_start)
- hole_size = key.offset - search_start;
- else
- hole_size = 0;
- }
+ dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes);
if (hole_size > max_hole_size) {
max_hole_start = search_start;
@@ -1542,8 +1668,8 @@ next:
*/
if (search_end > search_start) {
hole_size = search_end - search_start;
-
- if (contains_pending_extent(device, &search_start, hole_size)) {
+ if (dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes)) {
btrfs_release_path(path);
goto again;
}
@@ -1616,61 +1742,14 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
goto out;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to remove dev extent item");
- } else {
+ if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
- }
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_offset, u64 start, u64 num_bytes)
-{
- int ret;
- struct btrfs_path *path;
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_dev_extent *extent;
- struct extent_buffer *leaf;
- struct btrfs_key key;
-
- WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
- WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = device->devid;
- key.offset = start;
- key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(*extent));
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_dev_extent);
- btrfs_set_dev_extent_chunk_tree(leaf, extent,
- BTRFS_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_objectid(leaf, extent,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
- btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
@@ -1760,8 +1839,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
@@ -1799,58 +1880,52 @@ out:
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
*/
-static void update_dev_time(const char *path_name)
+static void update_dev_time(const char *device_path)
{
- struct file *filp;
+ struct path path;
+ struct timespec64 now;
+ int ret;
- filp = filp_open(path_name, O_RDWR, 0);
- if (IS_ERR(filp))
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- file_update_time(filp);
- filp_close(filp, NULL);
+
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
-
out:
btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -1878,12 +1953,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
- if (num_devices < btrfs_raid_array[i].devs_min) {
- int ret = btrfs_raid_array[i].mindev_error;
-
- if (ret)
- return ret;
- }
+ if (num_devices < btrfs_raid_array[i].devs_min)
+ return btrfs_raid_array[i].mindev_error;
}
return 0;
@@ -1905,30 +1976,27 @@ static struct btrfs_device * btrfs_find_next_active_device(
}
/*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device, in the context
* where this function called, there should be always be another device (or
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
- if (fs_info->fs_devices->latest_bdev == device->bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
}
/*
@@ -1949,52 +2017,100 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
{
+ struct btrfs_super_block *disk_super;
+ int copy_num;
+
+ if (!bdev)
+ return;
+
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+ struct page *page;
+ int ret;
+
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
+ if (IS_ERR(disk_super))
+ continue;
+
+ if (bdev_is_zoned(bdev)) {
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ continue;
+ }
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+
+ page = virt_to_page(disk_super);
+ set_page_dirty(page);
+ lock_page(page);
+ /* write_on_page() unlocks the page */
+ ret = write_one_page(page);
+ if (ret)
+ btrfs_warn(fs_info,
+ "error clearing superblock number %d (%d)",
+ copy_num, ret);
+ btrfs_release_disk_super(disk_super);
+
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+}
+
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
+{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
-
- device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+ return ret;
- if (IS_ERR(device)) {
- if (PTR_ERR(device) == -ENOENT &&
- strcmp(device_path, "missing") == 0)
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
- ret = PTR_ERR(device);
- goto out;
+ ret = -ENOENT;
+ return ret;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ return -ETXTBSY;
}
- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2003,20 +2119,26 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2034,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
- * its own fs_devices listed under the fs_devices->seed.
+ * its own fs_devices listed under the fs_devices->seed_list.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
@@ -2054,7 +2176,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_rm_device_link(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2062,32 +2184,45 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_devices->device_list_mutex);
/*
- * at this point, the device is zero sized and detached from
- * the devices list. All that's left is to zero out the old
- * supers and free the device.
+ * At this point, the device is zero sized and detached from the
+ * devices list. All that's left is to zero out the old supers and
+ * free the device.
+ *
+ * We cannot call btrfs_close_bdev() here because we're holding the sb
+ * write lock, and blkdev_put() will pull in the ->open_mutex on the
+ * block device and it's dependencies. Instead just flush the device
+ * and let the caller do the final blkdev_put.
*/
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
- btrfs_scratch_superblocks(device->bdev, device->name->str);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+ btrfs_scratch_superblocks(fs_info, device->bdev,
+ device->name->str);
+ if (device->bdev) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+ }
- btrfs_close_bdev(device);
+ *bdev = device->bdev;
+ *mode = device->mode;
synchronize_rcu();
btrfs_free_device(device);
- if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
- close_fs_devices(cur_devices);
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
+ list_del_init(&cur_devices->seed_list);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
free_fs_devices(cur_devices);
}
-out:
- mutex_unlock(&uuid_mutex);
+ ret = btrfs_commit_transaction(trans);
+
return ret;
error_undo:
@@ -2098,7 +2233,7 @@ error_undo:
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
@@ -2130,13 +2265,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2144,8 +2275,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2154,18 +2283,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
@@ -2174,7 +2296,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2187,110 +2309,139 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * The update_dev_time() with in btrfs_scratch_superblocks()
- * may lead to a call to btrfs_show_devname() which will try
- * to hold device_list_mutex. And here this device
- * is already out of device list, so we don't have to hold
- * the device_list_mutex lock.
- */
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
+ tgtdev->name->str);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
btrfs_free_device(tgtdev);
}
-static struct btrfs_device *btrfs_find_device_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * lookup a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks. The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
{
- int ret = 0;
struct btrfs_super_block *disk_super;
- u64 devid;
- u8 *dev_uuid;
struct block_device *bdev;
- struct buffer_head *bh;
- struct btrfs_device *device;
+ int ret;
- ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &bh);
- if (ret)
- return ERR_PTR(ret);
- disk_super = (struct btrfs_super_block *)bh->b_data;
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
+
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret) {
+ btrfs_put_dev_args_from_path(args);
+ return ret;
+ }
+
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid, true);
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
else
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid, true);
-
- brelse(bh);
- if (!device)
- device = ERR_PTR(-ENOENT);
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+ btrfs_release_disk_super(disk_super);
blkdev_put(bdev, FMODE_READ);
- return device;
+ return 0;
}
/*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
+ * that don't need to be freed.
*/
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *device;
+ int ret;
if (devid) {
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL, true);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
- if (!device_path || !device_path[0])
- return ERR_PTR(-EINVAL);
-
- if (strcmp(device_path, "missing") == 0) {
- /* Find first missing device */
- list_for_each_entry(device, &fs_info->fs_devices->devices,
- dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state) && !device->bdev)
- return device;
- }
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
return ERR_PTR(-ENOENT);
- }
-
- return btrfs_find_device_by_path(fs_info, device_path);
+ return device;
}
-/*
- * does all the dirty work required for changing file system's UUID.
- */
-static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
+static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
- struct btrfs_super_block *disk_super = fs_info->super_copy;
- struct btrfs_device *device;
- u64 super_flags;
lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
- return PTR_ERR(seed_devices);
+ return seed_devices;
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
- return PTR_ERR(old_devices);
+ return old_devices;
}
list_add(&old_devices->fs_list, &fs_uuids);
@@ -2301,33 +2452,60 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
- mutex_lock(&fs_devices->device_list_mutex);
+ return seed_devices;
+}
+
+/*
+ * Splice seed devices into the sprout fs_devices.
+ * Generate a new fsid for the sprouted read-write filesystem.
+ */
+static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *seed_devices)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct btrfs_device *device;
+ u64 super_flags;
+
+ /*
+ * We are updating the fsid, the thread leading to device_list_add()
+ * could race, so uuid_mutex is needed.
+ */
+ lockdep_assert_held(&uuid_mutex);
+
+ /*
+ * The threads listed below may traverse dev_list but can do that without
+ * device_list_mutex:
+ * - All device ops and balance - as we are in btrfs_exclop_start.
+ * - Various dev_list readers - are using RCU.
+ * - btrfs_ioctl_fitrim() - is using RCU.
+ *
+ * For-read threads as below are using device_list_mutex:
+ * - Readonly scrub btrfs_scrub_dev()
+ * - Readonly scrub btrfs_scrub_progress()
+ * - btrfs_get_dev_stats()
+ */
+ lockdep_assert_held(&fs_devices->device_list_mutex);
+
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
- fs_devices->seed = seed_devices;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
- mutex_unlock(&fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
-
- return 0;
}
/*
@@ -2335,6 +2513,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
@@ -2344,7 +2523,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- u64 devid;
int ret;
path = btrfs_alloc_path();
@@ -2356,7 +2534,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
@@ -2381,13 +2561,14 @@ next_slot:
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
- devid = btrfs_device_id(leaf, dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2408,18 +2589,18 @@ error:
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
- struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *seed_devices;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
- int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool seeding_dev = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2429,24 +2610,29 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
- seeding_dev = 1;
+ seeding_dev = true;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2462,24 +2648,31 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_assign_pointer(device->name, name);
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error_free_device;
+
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ goto error_free_device;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto error_free_device;
+ goto error_free_zone;
}
- q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
- device->total_bytes = round_down(i_size_read(bdev->bd_inode),
- fs_info->sectorsize);
+ device->total_bytes =
+ round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
- device->fs_info = fs_info;
- device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
@@ -2487,17 +2680,26 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- sb->s_flags &= ~SB_RDONLY;
- ret = btrfs_prepare_sprout(fs_info);
- if (ret) {
+ btrfs_clear_sb_rdonly(sb);
+
+ /* GFP_KERNEL allocation must not be under device_list_mutex */
+ seed_devices = btrfs_init_sprout(fs_info);
+ if (IS_ERR(seed_devices)) {
+ ret = PTR_ERR(seed_devices);
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
}
+ mutex_lock(&fs_devices->device_list_mutex);
+ if (seeding_dev) {
+ btrfs_setup_sprout(fs_info, seed_devices);
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
+ }
+
device->fs_devices = fs_devices;
- mutex_lock(&fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
@@ -2509,7 +2711,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(q))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2521,9 +2723,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
- /* add sysfs device entry */
- btrfs_sysfs_add_device_link(fs_devices, device);
-
/*
* we've got more storage, clear any full flags on the space
* infos
@@ -2531,6 +2730,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
@@ -2556,8 +2759,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_sprout_splice().
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2565,7 +2771,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2585,12 +2791,22 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
ret = btrfs_commit_transaction(trans);
}
- /* Update ctime/mtime for libblkid */
+ /*
+ * Now that we have written a new super block to this device, check all
+ * other fs_devices list if device_path alienates any other scanned
+ * device.
+ * We can ignore the return value as it typically returns -EINVAL and
+ * only succeeds if the device was an alien.
+ */
+ btrfs_forget_devices(device->devt);
+
+ /* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
+
return ret;
error_sysfs:
- btrfs_sysfs_rm_device_link(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2609,14 +2825,16 @@ error_sysfs:
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
if (seeding_dev)
- sb->s_flags |= SB_RDONLY;
+ btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
+error_free_zone:
+ btrfs_destroy_dev_zone_info(device);
error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -2676,6 +2894,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
@@ -2704,7 +2923,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -2755,7 +2978,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u32 cur;
struct btrfs_key key;
- mutex_lock(&fs_info->chunk_mutex);
+ lockdep_assert_held(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2785,7 +3008,6 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
cur += len;
}
}
- mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -2825,6 +3047,29 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
return em;
}
+static int remove_chunk_item(struct btrfs_trans_handle *trans,
+ struct map_lookup *map, u64 chunk_offset)
+{
+ int i;
+
+ /*
+ * Removing chunk items and updating the device items in the chunks btree
+ * requires holding the chunk_mutex.
+ * See the comment at btrfs_chunk_alloc() for the details.
+ */
+ lockdep_assert_held(&trans->fs_info->chunk_mutex);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ int ret;
+
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ if (ret)
+ return ret;
+ }
+
+ return btrfs_free_chunk(trans, chunk_offset);
+}
+
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2845,14 +3090,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return PTR_ERR(em);
}
map = em->map_lookup;
- mutex_lock(&fs_info->chunk_mutex);
- check_system_chunk(trans, map->type);
- mutex_unlock(&fs_info->chunk_mutex);
/*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+ * First delete the device extent items from the devices btree.
+ * We take the device_list_mutex to avoid racing with the finishing phase
+ * of a device replace operation. See the comment below before acquiring
+ * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
+ * because that can result in a deadlock when deleting the device extent
+ * items from the devices btree - COWing an extent buffer from the btree
+ * may result in allocating a new metadata chunk, which would attempt to
+ * lock again fs_info->chunk_mutex.
*/
mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
@@ -2874,18 +3121,73 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_update_device(trans, device);
+ /*
+ * We acquire fs_info->chunk_mutex for 2 reasons:
+ *
+ * 1) Just like with the first phase of the chunk allocation, we must
+ * reserve system space, do all chunk btree updates and deletions, and
+ * update the system chunk array in the superblock while holding this
+ * mutex. This is for similar reasons as explained on the comment at
+ * the top of btrfs_chunk_alloc();
+ *
+ * 2) Prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of
+ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
+ * the device item, which does not exists on the chunk btree.
+ * The finishing phase of device replace acquires both the
+ * device_list_mutex and the chunk_mutex, in that order, so we are
+ * safe by just acquiring the chunk_mutex.
+ */
+ trans->removing_chunk = true;
+ mutex_lock(&fs_info->chunk_mutex);
+
+ check_system_chunk(trans, map->type);
+
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ /*
+ * Normally we should not get -ENOSPC since we reserved space before
+ * through the call to check_system_chunk().
+ *
+ * Despite our system space_info having enough free space, we may not
+ * be able to allocate extents from its block groups, because all have
+ * an incompatible profile, which will force us to allocate a new system
+ * block group with the right profile, or right after we called
+ * check_system_space() above, a scrub turned the only system block group
+ * with enough free space into RO mode.
+ * This is explained with more detail at do_chunk_alloc().
+ *
+ * So if we get -ENOSPC, allocate a new system chunk and retry once.
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
if (ret) {
- mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
}
- }
- mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_free_chunk(trans, chunk_offset);
- if (ret) {
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2900,6 +3202,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
}
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+
+ /*
+ * We are done with chunk btree updates and deletions, so release the
+ * system space we previously reserved (with check_system_chunk()).
+ */
+ btrfs_trans_release_chunk_metadata(trans);
+
ret = btrfs_remove_block_group(trans, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2907,18 +3218,29 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
out:
+ if (trans->removing_chunk) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+ }
/* once for us */
free_extent_map(em);
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *block_group;
+ u64 length;
int ret;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "relocate: not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
@@ -2931,7 +3253,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
+ lockdep_assert_held(&fs_info->reclaim_bgs_lock);
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
@@ -2944,8 +3266,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (!block_group)
return -ENOENT;
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+ length = block_group->length;
btrfs_put_block_group(block_group);
+ /*
+ * On a zoned file system, discard the whole block group, this will
+ * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
+ * resetting the zone fails, don't treat it as a fatal problem from the
+ * filesystem's point of view.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
+ if (ret)
+ btrfs_info(fs_info,
+ "failed to reset zone %llu after relocation",
+ chunk_offset);
+ }
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -2986,10 +3323,10 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
@@ -2997,7 +3334,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto error;
if (ret > 0)
@@ -3018,7 +3355,7 @@ again:
else
BUG_ON(ret);
}
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (found_key.offset == 0)
break;
@@ -3145,7 +3482,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
@@ -3331,10 +3668,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
- if (nparity)
- return num_stripes - nparity;
- else
- return num_stripes / ncopies;
+ return (num_stripes - nparity) / ncopies;
}
/* [pstart, pend) */
@@ -3558,10 +3892,10 @@ again:
goto error;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
@@ -3575,7 +3909,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
ret = 0;
break;
}
@@ -3585,7 +3919,7 @@ again:
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
break;
}
@@ -3602,12 +3936,12 @@ again:
btrfs_release_path(path);
if (!ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
if (counting) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3632,7 +3966,7 @@ again:
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
@@ -3646,7 +3980,7 @@ again:
ret = btrfs_may_alloc_data_chunk(fs_info,
found_key.offset);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
} else if (ret == 1) {
chunk_reserved = 1;
@@ -3654,7 +3988,7 @@ again:
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
} else if (ret == -ETXTBSY) {
@@ -3723,13 +4057,25 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
atomic_read(&fs_info->balance_cancel_req) == 0);
}
-/* Non-zero return value signifies invalidity */
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
- u64 allowed)
+/*
+ * Validate target profile against allowed profiles and return true if it's OK.
+ * Otherwise print the error message and return false.
+ */
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
+ const struct btrfs_balance_args *bargs,
+ u64 allowed, const char *type)
{
- return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl_arg->target, 1) ||
- (bctl_arg->target & ~allowed)));
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return true;
+
+ /* Profile is valid and does not have bits outside of the allowed set */
+ if (alloc_profile_is_valid(bargs->target, 1) &&
+ (bargs->target & ~allowed) == 0)
+ return true;
+
+ btrfs_err(fs_info, "balance: invalid convert %s profile %s",
+ type, btrfs_bg_type_to_raid_name(bargs->target));
+ return false;
}
/*
@@ -3904,7 +4250,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
- atomic_read(&fs_info->balance_cancel_req)) {
+ btrfs_should_cancel_balance(fs_info)) {
ret = -EINVAL;
goto out;
}
@@ -3931,7 +4277,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -3945,24 +4291,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
- if (validate_convert_profile(&bctl->data, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert data profile %s",
- btrfs_bg_type_to_raid_name(bctl->data.target));
- ret = -EINVAL;
- goto out;
- }
- if (validate_convert_profile(&bctl->meta, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert metadata profile %s",
- btrfs_bg_type_to_raid_name(bctl->meta.target));
- ret = -EINVAL;
- goto out;
- }
- if (validate_convert_profile(&bctl->sys, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert system profile %s",
- btrfs_bg_type_to_raid_name(bctl->sys.target));
+ if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
+ !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
+ !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
ret = -EINVAL;
goto out;
}
@@ -4017,14 +4348,6 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
btrfs_bg_type_to_raid_name(data_target));
}
- if (fs_info->send_in_progress) {
- btrfs_warn_rl(fs_info,
-"cannot run balance while send operations are in progress (%d in progress)",
- fs_info->send_in_progress);
- ret = -EAGAIN;
- goto out;
- }
-
ret = insert_balance_item(fs_info, bctl);
if (ret && ret != -EEXIST)
goto out;
@@ -4050,9 +4373,26 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
- if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
btrfs_info(fs_info, "balance: paused");
- else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
+ }
+ /*
+ * Balance can be canceled by:
+ *
+ * - Regular cancel request
+ * Then ret == -ECANCELED and balance_cancel_req > 0
+ *
+ * - Fatal signal to "btrfs" process
+ * Either the signal caught by wait_reserve_ticket() and callers
+ * got -EINTR, or caught by btrfs_should_cancel_balance() and
+ * got -ECANCELED.
+ * Either way, in this case balance_cancel_req = 0, and
+ * ret == -EINTR or ret == -ECANCELED.
+ *
+ * So here we only check the return value to catch canceled balance.
+ */
+ else if (ret == -ECANCELED || ret == -EINTR)
btrfs_info(fs_info, "balance: canceled");
else
btrfs_info(fs_info, "balance: ended with status: %d", ret);
@@ -4067,7 +4407,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4078,7 +4418,7 @@ out:
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4088,10 +4428,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
@@ -4112,6 +4454,10 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
/*
* A ro->rw remount sequence should continue with the paused balance
* regardless of who pauses it, system or the user as of now, so set
@@ -4180,10 +4526,12 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
+ btrfs_release_path(path);
+
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
@@ -4262,7 +4610,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -4274,7 +4622,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
return 0;
}
-static int btrfs_uuid_scan_kthread(void *data)
+int btrfs_uuid_scan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_root *root = fs_info->tree_root;
@@ -4286,6 +4634,7 @@ static int btrfs_uuid_scan_kthread(void *data)
struct btrfs_root_item root_item;
u32 item_size;
struct btrfs_trans_handle *trans = NULL;
+ bool closing = false;
path = btrfs_alloc_path();
if (!path) {
@@ -4298,6 +4647,10 @@ static int btrfs_uuid_scan_kthread(void *data)
key.offset = 0;
while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ closing = true;
+ break;
+ }
ret = btrfs_search_forward(root, &key, path,
BTRFS_OLDEST_GENERATION);
if (ret) {
@@ -4314,7 +4667,7 @@ static int btrfs_uuid_scan_kthread(void *data)
eb = path->nodes[0];
slot = path->slots[0];
- item_size = btrfs_item_size_nr(eb, slot);
+ item_size = btrfs_item_size(eb, slot);
if (item_size < sizeof(root_item))
goto skip;
@@ -4344,6 +4697,7 @@ static int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
@@ -4368,6 +4722,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4375,7 +4730,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -4397,76 +4751,12 @@ out:
btrfs_end_transaction(trans);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
- else
+ else if (!closing)
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
up(&fs_info->uuid_tree_rescan_sem);
return 0;
}
-/*
- * Callback for btrfs_uuid_tree_iterate().
- * returns:
- * 0 check succeeded, the entry is not outdated.
- * < 0 if an error occurred.
- * > 0 if the check failed, which means the caller shall remove the entry.
- */
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
- u8 *uuid, u8 type, u64 subid)
-{
- struct btrfs_key key;
- int ret = 0;
- struct btrfs_root *subvol_root;
-
- if (type != BTRFS_UUID_KEY_SUBVOL &&
- type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
- goto out;
-
- key.objectid = subid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(subvol_root)) {
- ret = PTR_ERR(subvol_root);
- if (ret == -ENOENT)
- ret = 1;
- goto out;
- }
-
- switch (type) {
- case BTRFS_UUID_KEY_SUBVOL:
- if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
- ret = 1;
- break;
- case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
- if (memcmp(uuid, subvol_root->root_item.received_uuid,
- BTRFS_UUID_SIZE))
- ret = 1;
- break;
- }
-
-out:
- return ret;
-}
-
-static int btrfs_uuid_rescan_kthread(void *data)
-{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
- int ret;
-
- /*
- * 1st step is to iterate through the existing UUID tree and
- * to delete all entries that contain outdated data.
- * 2nd step is to add all missing entries to the UUID tree.
- */
- ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
- if (ret < 0) {
- btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
- up(&fs_info->uuid_tree_rescan_sem);
- return ret;
- }
- return btrfs_uuid_scan_kthread(data);
-}
-
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
@@ -4509,22 +4799,6 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
-{
- struct task_struct *task;
-
- down(&fs_info->uuid_tree_rescan_sem);
- task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
- if (IS_ERR(task)) {
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
- btrfs_warn(fs_info, "failed to start uuid_rescan task");
- up(&fs_info->uuid_tree_rescan_sem);
- return PTR_ERR(task);
- }
-
- return 0;
-}
-
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -4599,19 +4873,18 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
- if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret < 0)
- goto done;
if (ret) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ if (ret < 0)
+ goto done;
ret = 0;
btrfs_release_path(path);
break;
@@ -4622,7 +4895,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@@ -4631,7 +4904,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@@ -4647,12 +4920,12 @@ again:
*/
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
} else if (ret) {
@@ -4682,6 +4955,10 @@ again:
}
mutex_lock(&fs_info->chunk_mutex);
+ /* Clear all state bits beyond the shrunk device size */
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK);
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
@@ -4692,8 +4969,10 @@ again:
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -4722,13 +5001,12 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
u32 array_size;
u8 *ptr;
- mutex_lock(&fs_info->chunk_mutex);
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
- mutex_unlock(&fs_info->chunk_mutex);
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
- }
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4737,7 +5015,6 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
- mutex_unlock(&fs_info->chunk_mutex);
return 0;
}
@@ -4777,96 +5054,137 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID1C34);
}
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- u64 start, u64 type)
-{
- struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct btrfs_device *device;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct btrfs_device_info *devices_info = NULL;
- u64 total_avail;
- int num_stripes; /* total number of stripes to allocate */
- int data_stripes; /* number of stripes that count for
- block group size */
- int sub_stripes; /* sub_stripes info for map */
- int dev_stripes; /* stripes per dev */
- int devs_max; /* max devs to use */
- int devs_min; /* min devs needed */
- int devs_increment; /* ndevs has to be a multiple of this */
- int ncopies; /* how many copies to data has */
- int nparity; /* number of stripes worth of bytes to
- store parity information */
- int ret;
+/*
+ * Structure used internally for btrfs_create_chunk() function.
+ * Wraps needed parameters.
+ */
+struct alloc_chunk_ctl {
+ u64 start;
+ u64 type;
+ /* Total number of stripes to allocate */
+ int num_stripes;
+ /* sub_stripes info for map */
+ int sub_stripes;
+ /* Stripes per device */
+ int dev_stripes;
+ /* Maximum number of devices to use */
+ int devs_max;
+ /* Minimum number of devices to use */
+ int devs_min;
+ /* ndevs has to be a multiple of this */
+ int devs_increment;
+ /* Number of copies */
+ int ncopies;
+ /* Number of stripes worth of bytes to store parity information */
+ int nparity;
u64 max_stripe_size;
u64 max_chunk_size;
+ u64 dev_extent_min;
u64 stripe_size;
u64 chunk_size;
int ndevs;
- int i;
- int j;
- int index;
+};
- BUG_ON(!alloc_profile_is_valid(type, 0));
+static void init_alloc_chunk_ctl_policy_regular(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ struct btrfs_space_info *space_info;
- if (list_empty(&fs_devices->alloc_list)) {
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
- btrfs_debug(info, "%s: no writable device", __func__);
- return -ENOSPC;
- }
+ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
+ ASSERT(space_info);
- index = btrfs_bg_flags_to_raid_index(type);
+ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
+ ctl->max_stripe_size = ctl->max_chunk_size;
- sub_stripes = btrfs_raid_array[index].sub_stripes;
- dev_stripes = btrfs_raid_array[index].dev_stripes;
- devs_max = btrfs_raid_array[index].devs_max;
- if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info);
- devs_min = btrfs_raid_array[index].devs_min;
- devs_increment = btrfs_raid_array[index].devs_increment;
- ncopies = btrfs_raid_array[index].ncopies;
- nparity = btrfs_raid_array[index].nparity;
+ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
+ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
+ /* We don't want a chunk larger than 10% of writable space */
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size);
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+}
+
+static void init_alloc_chunk_ctl_policy_zoned(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 zone_size = fs_devices->fs_info->zone_size;
+ u64 limit;
+ int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
+ int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+ u64 min_chunk_size = min_data_stripes * zone_size;
+ u64 type = ctl->type;
+
+ ctl->max_stripe_size = zone_size;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = SZ_1G;
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
+ ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+ zone_size);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* for larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- max_stripe_size = SZ_1G;
- else
- max_stripe_size = SZ_256M;
- max_chunk_size = max_stripe_size;
+ ctl->max_chunk_size = ctl->max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = SZ_32M;
- max_chunk_size = 2 * max_stripe_size;
- devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
- btrfs_err(info, "invalid chunk type 0x%llx requested",
- type);
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
- max_chunk_size);
+ limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ zone_size),
+ min_chunk_size);
+ ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
+ ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+}
+
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
+
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
+ if (!ctl->devs_max)
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
+ ctl->nparity = btrfs_raid_array[index].nparity;
+ ctl->ndevs = 0;
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+ break;
+ default:
+ BUG();
+ }
+}
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
- GFP_NOFS);
- if (!devices_info)
- return -ENOMEM;
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ u64 total_avail;
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
+ int ret;
+ int ndevs = 0;
+ u64 max_avail;
+ u64 dev_offset;
/*
* in the first pass through the devices list, we gather information
* about the available holes on each device.
*/
- ndevs = 0;
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- u64 max_avail;
- u64 dev_offset;
-
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
@@ -4884,24 +5202,23 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
total_avail = 0;
/* If there is no space on this device, skip it. */
- if (total_avail == 0)
+ if (total_avail < ctl->dev_extent_min)
continue;
- ret = find_free_dev_extent(device,
- max_stripe_size * dev_stripes,
- &dev_offset, &max_avail);
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
+ &max_avail);
if (ret && ret != -ENOSPC)
- goto error;
+ return ret;
if (ret == 0)
- max_avail = max_stripe_size * dev_stripes;
+ max_avail = dev_extent_want;
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+ if (max_avail < ctl->dev_extent_min) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info,
- "%s: devid %llu has no free space, have=%llu want=%u",
+ "%s: devid %llu has no free space, have=%llu want=%llu",
__func__, device->devid, max_avail,
- BTRFS_STRIPE_LEN * dev_stripes);
+ ctl->dev_extent_min);
continue;
}
@@ -4916,6 +5233,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
devices_info[ndevs].dev = device;
++ndevs;
}
+ ctl->ndevs = ndevs;
/*
* now sort the devices by hole size / available space
@@ -4923,23 +5241,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
- /*
- * Round down to number of usable stripes, devs_increment can be any
- * number so we can't use round_down()
- */
- ndevs -= ndevs % devs_increment;
-
- if (ndevs < devs_min) {
- ret = -ENOSPC;
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
- btrfs_debug(info,
- "%s: not enough devices with free space: have=%d minimum required=%d",
- __func__, ndevs, devs_min);
- }
- goto error;
- }
+ return 0;
+}
- ndevs = min(ndevs, devs_max);
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ /* Number of stripes that count for block group size */
+ int data_stripes;
/*
* The primary goal is to maximize the number of stripes, so use as
@@ -4948,73 +5257,154 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* The DUP profile stores more than one stripe per device, the
* max_avail is the total size so we have to adjust.
*/
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
- num_stripes = ndevs * dev_stripes;
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
- /*
- * this will have to be fixed for RAID1 and RAID10 over
- * more drives
- */
- data_stripes = (num_stripes - nparity) / ncopies;
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
/*
- * Use the number of data stripes to figure out how big this chunk
- * is really going to be in terms of logical address space,
- * and compare that answer with the max chunk size. If it's higher,
- * we try to reduce stripe_size.
+ * Use the number of data stripes to figure out how big this chunk is
+ * really going to be in terms of logical address space, and compare
+ * that answer with the max chunk size. If it's higher, we try to
+ * reduce stripe_size.
*/
- if (stripe_size * data_stripes > max_chunk_size) {
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
/*
* Reduce stripe_size, round it up to a 16MB boundary again and
* then use it, unless it ends up being even bigger than the
* previous value we had already.
*/
- stripe_size = min(round_up(div_u64(max_chunk_size,
- data_stripes), SZ_16M),
- stripe_size);
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
+ data_stripes), SZ_16M),
+ ctl->stripe_size);
}
- /* align to BTRFS_STRIPE_LEN */
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
+ /* Stripe size should not go beyond 1G. */
+ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- ret = -ENOMEM;
- goto error;
+ /* Align to BTRFS_STRIPE_LEN */
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+ /* Number of stripes that count for block group size */
+ int data_stripes;
+
+ /*
+ * It should hold because:
+ * dev_extent_min == dev_extent_want == zone_size * dev_stripes
+ */
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+ ctl->stripe_size = zone_size;
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+ /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+ ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+ ctl->stripe_size) + ctl->nparity,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
}
- map->num_stripes = num_stripes;
- for (i = 0; i < ndevs; ++i) {
- for (j = 0; j < dev_stripes; ++j) {
- int s = i * dev_stripes + j;
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+
+ /*
+ * Round down to number of usable stripes, devs_increment can be any
+ * number so we can't use round_down() that requires power of 2, while
+ * rounddown is safe.
+ */
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
+
+ if (ctl->ndevs < ctl->devs_min) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ctl->ndevs, ctl->devs_min);
+ }
+ return -ENOSPC;
+ }
+
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return decide_stripe_size_regular(ctl, devices_info);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return decide_stripe_size_zoned(ctl, devices_info);
+ default:
+ BUG();
+ }
+}
+
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct btrfs_block_group *block_group;
+ struct extent_map *em;
+ u64 start = ctl->start;
+ u64 type = ctl->type;
+ int ret;
+ int i;
+ int j;
+
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+ map->num_stripes = ctl->num_stripes;
+
+ for (i = 0; i < ctl->ndevs; ++i) {
+ for (j = 0; j < ctl->dev_stripes; ++j) {
+ int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
- j * stripe_size;
+ j * ctl->stripe_size;
}
}
map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->sub_stripes = sub_stripes;
+ map->sub_stripes = ctl->sub_stripes;
- chunk_size = stripe_size * data_stripes;
-
- trace_btrfs_chunk_alloc(info, map, start, chunk_size);
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
em = alloc_extent_map();
if (!em) {
kfree(map);
- ret = -ENOMEM;
- goto error;
+ return ERR_PTR(-ENOMEM);
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
- em->len = chunk_size;
+ em->len = ctl->chunk_size;
em->block_start = 0;
em->block_len = em->len;
- em->orig_block_len = stripe_size;
+ em->orig_block_len = ctl->stripe_size;
em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
@@ -5022,31 +5412,32 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
- goto error;
+ return ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
- if (ret)
+ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+ if (IS_ERR(block_group))
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
- btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
+ btrfs_device_set_bytes_used(dev,
+ dev->bytes_used + ctl->stripe_size);
if (list_empty(&dev->post_commit_list))
list_add_tail(&dev->post_commit_list,
&trans->transaction->dev_update_list);
}
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
+ &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
- kfree(devices_info);
- return 0;
+ return block_group;
error_del_extent:
write_lock(&em_tree->lock);
@@ -5057,82 +5448,151 @@ error_del_extent:
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
-error:
+
+ return block_group;
+}
+
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
+ u64 type)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct btrfs_device_info *devices_info = NULL;
+ struct alloc_chunk_ctl ctl;
+ struct btrfs_block_group *block_group;
+ int ret;
+
+ lockdep_assert_held(&info->chunk_mutex);
+
+ if (!alloc_profile_is_valid(type, 0)) {
+ ASSERT(0);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+ ASSERT(0);
+ return ERR_PTR(-EINVAL);
+ }
+
+ ctl.start = find_next_chunk(info);
+ ctl.type = type;
+ init_alloc_chunk_ctl(fs_devices, &ctl);
+
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+ return ERR_PTR(-ENOMEM);
+
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
+ goto out;
+ }
+
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
+ goto out;
+ }
+
+ block_group = create_chunk(trans, &ctl, devices_info);
+
+out:
kfree(devices_info);
- return ret;
+ return block_group;
}
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size)
+/*
+ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
+ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
+ * chunks.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
- struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
- u64 dev_offset;
- u64 stripe_size;
- int i = 0;
- int ret = 0;
+ int i;
+ int ret;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ /*
+ * We take the chunk_mutex for 2 reasons:
+ *
+ * 1) Updates and insertions in the chunk btree must be done while holding
+ * the chunk_mutex, as well as updating the system chunk array in the
+ * superblock. See the comment on top of btrfs_chunk_alloc() for the
+ * details;
+ *
+ * 2) To prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * which would cause a failure when updating the device item, which does
+ * not exists, or persisting a stripe of the chunk item with such ID.
+ * Here we can't use the device_list_mutex because our caller already
+ * has locked the chunk_mutex, and the final phase of device replace
+ * acquires both mutexes - first the device_list_mutex and then the
+ * chunk_mutex. Using any of those two mutexes protects us from a
+ * concurrent device replace.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
- stripe_size = em->orig_block_len;
chunk = kzalloc(item_size, GFP_NOFS);
if (!chunk) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
- /*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with the map's stripes, because the device object's id can change
- * at any time during that final phase of the device replace operation
- * (dev-replace.c:btrfs_dev_replace_finishing()).
- */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_update_device(trans, device);
if (ret)
- break;
- ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
- dev_offset, stripe_size);
- if (ret)
- break;
- }
- if (ret) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out;
+ goto out;
}
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_set_stack_chunk_length(chunk, chunk_size);
- btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+ btrfs_set_stack_chunk_length(chunk, bg->length);
+ btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
@@ -5143,15 +5603,18 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.type = BTRFS_CHUNK_ITEM_KEY;
- key.offset = chunk_offset;
+ key.offset = bg->start;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- /*
- * TODO: Cleanup of inserted chunk root in case of
- * failure.
- */
+ if (ret)
+ goto out;
+
+ set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+ if (ret)
+ goto out;
}
out:
@@ -5160,40 +5623,45 @@ out:
return ret;
}
-/*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
- */
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
-{
- u64 chunk_offset;
-
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
- chunk_offset = find_next_chunk(trans->fs_info);
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
-}
-
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- u64 chunk_offset;
- u64 sys_chunk_offset;
u64 alloc_profile;
- int ret;
+ struct btrfs_block_group *meta_bg;
+ struct btrfs_block_group *sys_bg;
+
+ /*
+ * When adding a new device for sprouting, the seed device is read-only
+ * so we must first allocate a metadata and a system chunk. But before
+ * adding the block group items to the extent, device and chunk btrees,
+ * we must first:
+ *
+ * 1) Create both chunks without doing any changes to the btrees, as
+ * otherwise we would get -ENOSPC since the block groups from the
+ * seed device are read-only;
+ *
+ * 2) Add the device item for the new sprout device - finishing the setup
+ * of a new block group requires updating the device item in the chunk
+ * btree, so it must exist when we attempt to do it. The previous step
+ * ensures this does not fail with -ENOSPC.
+ *
+ * After that we can add the block group items to their btrees:
+ * update existing device item in the chunk btree, add a new block group
+ * item to the extent btree, add a new chunk item to the chunk btree and
+ * finally add the new device extent items to the devices btree.
+ */
- chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
- if (ret)
- return ret;
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
+ if (IS_ERR(meta_bg))
+ return PTR_ERR(meta_bg);
- sys_chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
- return ret;
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
+ if (IS_ERR(sys_bg))
+ return PTR_ERR(sys_bg);
+
+ return 0;
}
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
@@ -5203,17 +5671,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
return btrfs_raid_array[index].tolerated_failures;
}
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- int readonly = 0;
int miss_ndevs = 0;
int i;
+ bool ret = true;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
- return 1;
+ return false;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
@@ -5224,21 +5692,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
- readonly = 1;
+ ret = false;
goto end;
}
}
/*
- * If the number of missing devices is larger than max errors,
- * we can not write the data into that chunk successfully, so
- * set it readonly.
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
- readonly = 1;
+ ret = false;
end:
free_extent_map(em);
- return readonly;
+ return ret;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@ -5264,7 +5731,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
- int ret;
+ enum btrfs_raid_types index;
+ int ret = 1;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
@@ -5277,10 +5745,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
map = em->map_lookup;
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
- ret = map->num_stripes;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- ret = map->sub_stripes;
+ index = btrfs_bg_flags_to_raid_index(map->type);
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ ret = btrfs_raid_array[index].ncopies;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
@@ -5292,8 +5761,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- else
- ret = 1;
free_extent_map(em);
down_read(&fs_info->dev_replace.rwsem);
@@ -5312,6 +5779,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return len;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
@@ -5329,6 +5799,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return 0;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if(!WARN_ON(IS_ERR(em))) {
@@ -5358,7 +5831,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- preferred_mirror = first + current->pid % num_stripes;
+ switch (fs_info->fs_devices->read_policy) {
+ default:
+ /* Shouldn't happen, just warn and use pid instead of failing */
+ btrfs_warn_rl(fs_info,
+ "unknown read_policy type %u, reset to pid",
+ fs_info->fs_devices->read_policy);
+ fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fallthrough;
+ case BTRFS_READ_POLICY_PID:
+ preferred_mirror = first + (current->pid % num_stripes);
+ break;
+ }
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
@@ -5389,85 +5873,77 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-static inline int parity_smaller(u64 a, u64 b)
-{
- return a > b;
-}
-
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
{
- struct btrfs_bio_stripe s;
int i;
- u64 l;
int again = 1;
while (again) {
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
- if (parity_smaller(bbio->raid_map[i],
- bbio->raid_map[i+1])) {
- s = bbio->stripes[i];
- l = bbio->raid_map[i];
- bbio->stripes[i] = bbio->stripes[i+1];
- bbio->raid_map[i] = bbio->raid_map[i+1];
- bbio->stripes[i+1] = s;
- bbio->raid_map[i+1] = l;
-
+ /* Swap if parity is on a smaller index */
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
again = 1;
}
}
}
}
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
{
- struct btrfs_bio *bbio = kzalloc(
- /* the size of the btrfs_bio */
- sizeof(struct btrfs_bio) +
- /* plus the variable array for the stripes */
- sizeof(struct btrfs_bio_stripe) * (total_stripes) +
- /* plus the variable array for the tgt dev */
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
- * plus the raid_map, which includes both the tgt dev
- * and the stripes
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bbio->error, 0);
- refcount_set(&bbio->refs, 1);
+ refcount_set(&bioc->refs, 1);
+
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
- return bbio;
+ return bioc;
}
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
- WARN_ON(!refcount_read(&bbio->refs));
- refcount_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
}
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
- if (!bbio)
+ if (!bioc)
return;
- if (refcount_dec_and_test(&bbio->refs))
- kfree(bbio);
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
}
-/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that, discard won't be sent to target device of device
* replace.
*/
-static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 *length_ret,
- struct btrfs_bio **bbio_ret)
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_bio *bbio;
+ struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5476,29 +5952,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 stripe_cnt;
u64 stripe_len;
u64 stripe_offset;
- u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
- int ret = 0;
+ int ret;
int i;
- /* discard always return a bbio */
- ASSERT(bbio_ret);
-
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
- return PTR_ERR(em);
+ return ERR_CAST(em);
map = em->map_lookup;
+
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
- goto out;
- }
+ goto out_free_map;
+}
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
@@ -5524,7 +5997,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- num_stripes = 1;
+ *num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
@@ -5534,7 +6007,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
- num_stripes = min_t(u64, map->num_stripes,
+ *num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= sub_stripes;
@@ -5544,32 +6017,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = map->num_stripes;
+ *num_stripes = map->num_stripes;
} else {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
}
- bbio = alloc_btrfs_bio(num_stripes, 0);
- if (!bbio) {
+ stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
+ if (!stripes) {
ret = -ENOMEM;
- goto out;
+ goto out_free_map;
}
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
+ for (i = 0; i < *num_stripes; i++) {
+ stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
- map->stripe_len;
+ stripes[i].length = stripes_per_dev * map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
+ stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -5580,19 +6051,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
+ stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
+ stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bbio->stripes[i].length = length;
+ stripes[i].length = length;
}
stripe_index++;
@@ -5602,12 +6071,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
-out:
free_extent_map(em);
- return ret;
+ return stripes;
+out_free_map:
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
/*
@@ -5628,7 +6096,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
@@ -5637,20 +6105,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bbio, 0, 0);
+ logical, &length, &bioc, NULL, NULL, 0);
if (ret) {
- ASSERT(bbio == NULL);
+ ASSERT(bioc == NULL);
return ret;
}
- num_stripes = bbio->num_stripes;
+ num_stripes = bioc->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return -EIO;
}
@@ -5660,7 +6128,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid != srcdev_devid)
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
continue;
/*
@@ -5668,15 +6136,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
* mirror with the lowest physical address
*/
if (found &&
- physical_of_found <= bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
ASSERT(found);
if (!found)
@@ -5687,12 +6155,30 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
return ret;
}
+static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+ bool ret;
+
+ /* Non zoned filesystem does not use "to_copy" flag */
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+
+ ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
struct btrfs_dev_replace *dev_replace,
+ u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_bio *bbio = *bbio_ret;
+ struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
@@ -5703,6 +6189,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
int index_where_to_add;
/*
+ * A block group which have "to_copy" set will eventually
+ * copied by dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk to
* the new disk takes place at run time while the filesystem is
@@ -5715,17 +6208,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
new->physical = old->physical;
- new->length = old->length;
new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
+ bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
@@ -5745,30 +6237,27 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
}
if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
@@ -5777,8 +6266,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
- bbio->num_tgtdevs = tgtdev_indexes;
- *bbio_ret = bbio;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -5787,60 +6276,50 @@ static bool need_full_stripe(enum btrfs_map_op op)
}
/*
- * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
- * tuple. This information is used to calculate how big a
- * particular bio can get before it straddles a stripe.
+ * Calculate the geometry of a particular (address, len) tuple. This
+ * information is used to calculate how big a particular bio can get before it
+ * straddles a stripe.
*
- * @fs_info - the filesystem
- * @logical - address that we want to figure out the geometry of
- * @len - the length of IO we are going to perform, starting at @logical
- * @op - type of operation - write or read
- * @io_geom - pointer used to return values
+ * @fs_info: the filesystem
+ * @em: mapping containing the logical extent
+ * @op: type of operation - write or read
+ * @logical: address that we want to figure out the geometry of
+ * @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
+ enum btrfs_map_op op, u64 logical,
+ struct btrfs_io_geometry *io_geom)
{
- struct extent_map *em;
struct map_lookup *map;
+ u64 len;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
- u64 stripe_len;
+ u32 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
- int ret = 0;
ASSERT(op != BTRFS_MAP_DISCARD);
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
map = em->map_lookup;
/* Offset of this logical address in the chunk */
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
- /* Stripe wher this block falls in */
- stripe_nr = div64_u64(offset, stripe_len);
- /* Offset of stripe in the chunk */
- stripe_offset = stripe_nr * stripe_len;
- if (offset < stripe_offset) {
- btrfs_crit(fs_info,
-"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
- stripe_offset, offset, em->start, logical, stripe_len);
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * Stripe_nr is where this block falls in
+ * stripe_offset is the offset of this block in its stripe.
+ */
+ stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+ ASSERT(stripe_offset < U32_MAX);
- /* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ /* Only stripe based profiles needs to check against stripe length. */
+ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
u64 max_len = stripe_len - stripe_offset;
/*
@@ -5880,17 +6359,22 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
-out:
- /* once for us */
- free_extent_map(em);
- return ret;
+ return 0;
+}
+
+static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
+ u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+{
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
- int mirror_num, int need_raid_map)
+ enum btrfs_map_op op, u64 logical, u64 *length,
+ struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -5901,10 +6385,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int data_stripes;
int i;
int ret = 0;
+ int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
@@ -5913,18 +6398,16 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
- ASSERT(bbio_ret);
+ ASSERT(bioc_ret);
+ ASSERT(op != BTRFS_MAP_DISCARD);
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
+ ASSERT(!IS_ERR(em));
- ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
if (ret < 0)
return ret;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- ASSERT(!IS_ERR(em));
map = em->map_lookup;
*length = geom.len;
@@ -6003,6 +6486,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
@@ -6010,9 +6494,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
- max_errors = nr_parity_stripes(map);
+ max_errors = btrfs_chunk_max_errors(map);
- *length = map->stripe_len;
+ /* Return the length to the full stripe end */
+ *length = min(logical + *length,
+ raid56_full_stripe_start + em->start +
+ data_stripes * stripe_len) - logical;
stripe_index = 0;
stripe_offset = 0;
} else {
@@ -6059,68 +6546,78 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
- bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
- if (!bbio) {
+ /*
+ * If this I/O maps to a single device, try to return the device and
+ * physical block information on the stack instead of allocating an
+ * I/O context structure.
+ */
+ if (smap && num_alloc_stripes == 1 &&
+ !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
+ (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+ !dev_replace->tgtdev)) {
+ if (patch_the_first_stripe_for_dev_replace) {
+ smap->dev = dev_replace->tgtdev;
+ smap->physical = physical_to_patch_in_first_stripe;
+ *mirror_num_ret = map->num_stripes + 1;
+ } else {
+ set_io_stripe(smap, map, stripe_index, stripe_offset,
+ stripe_nr);
+ *mirror_num_ret = mirror_num;
+ }
+ *bioc_ret = NULL;
+ ret = 0;
+ goto out;
+ }
+
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
- /* build raid_map */
+ for (i = 0; i < num_stripes; i++) {
+ set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
+ stripe_nr);
+ stripe_index++;
+ }
+
+ /* Build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
- sizeof(struct btrfs_bio_stripe) *
- num_alloc_stripes +
- sizeof(int) * tgtdev_indexes);
-
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
- bbio->raid_map[(i+rot) % num_stripes] =
+ bioc->raid_map[(i + rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bbio->raid_map[(i+rot+1) % num_stripes] =
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
RAID6_Q_STRIPE;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
+ sort_parity_stripes(bioc, num_stripes);
}
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
- if (bbio->raid_map)
- sort_parity_stripes(bbio, num_stripes);
-
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
- &max_errors);
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
+ &num_stripes, &max_errors);
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -6129,9 +6626,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
- bbio->stripes[0].dev = dev_replace->tgtdev;
- bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
- bbio->mirror_num = map->num_stripes + 1;
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
@@ -6145,173 +6642,245 @@ out:
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num)
+ struct btrfs_io_context **bioc_ret, int mirror_num)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
- mirror_num, 0);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, &mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, NULL, 1);
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+/*
+ * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+ btrfs_bio_end_io_t end_io, void *private)
{
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
- bio_endio(bio);
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->end_io = end_io;
+ bbio->private = private;
+}
- btrfs_put_bbio(bbio);
+/*
+ * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
+ * a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ return bio;
}
-static void btrfs_end_bio(struct bio *bio)
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private)
{
- struct btrfs_bio *bbio = bio->bi_private;
- int is_orig_bio = 0;
+ struct bio *bio;
+ struct btrfs_bio *bbio;
- if (bio->bi_status) {
- atomic_inc(&bbio->error);
- if (bio->bi_status == BLK_STS_IOERR ||
- bio->bi_status == BLK_STS_TARGET) {
- unsigned int stripe_index =
- btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
-
- BUG_ON(stripe_index >= bbio->num_stripes);
- dev = bbio->stripes[stripe_index].dev;
- if (dev->bdev) {
- if (bio_op(bio) == REQ_OP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
- }
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, end_io, private);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ bbio->iter = bio->bi_iter;
+ return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+ if (!dev || !dev->bdev)
+ return;
+ if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+ return;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
+}
+
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ if (bio->bi_opf & REQ_META)
+ return fs_info->endio_meta_workers;
+ return fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+
+ bbio->end_io(bbio);
+}
+
+static void btrfs_simple_end_io(struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(fs_info);
+
+ if (bio->bi_status)
+ btrfs_log_dev_io_error(bio, bbio->device);
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
+ } else {
+ bbio->end_io(bbio);
}
+}
- if (bio == bbio->orig_bio)
- is_orig_bio = 1;
+static void btrfs_raid56_end_io(struct bio *bio)
+{
+ struct btrfs_io_context *bioc = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
- btrfs_bio_counter_dec(bbio->fs_info);
+ btrfs_bio_counter_dec(bioc->fs_info);
+ bbio->mirror_num = bioc->mirror_num;
+ bbio->end_io(bbio);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
- if (!is_orig_bio) {
- bio_put(bio);
- bio = bbio->orig_bio;
- }
+ btrfs_put_bioc(bioc);
+}
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
- /* only send an error to the higher layers if it is
- * beyond the tolerance of the btrfs bio
- */
- if (atomic_read(&bbio->error) > bbio->max_errors) {
- bio->bi_status = BLK_STS_IOERR;
- } else {
- /*
- * this bio is actually up to date, we didn't
- * go over the max number of errors
- */
- bio->bi_status = BLK_STS_OK;
- }
+static void btrfs_orig_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
- btrfs_end_bbio(bbio, bio);
- } else if (!is_orig_bio) {
- bio_put(bio);
+ if (bio->bi_status) {
+ atomic_inc(&bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
}
+
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
+
+ bbio->end_io(bbio);
+ btrfs_put_bioc(bioc);
}
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr)
+static void btrfs_clone_write_end_io(struct bio *bio)
{
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+
+ if (bio->bi_status) {
+ atomic_inc(&stripe->bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
+ }
+
+ /* Pass on control to the original bio this one was cloned from */
+ bio_endio(stripe->bioc->orig_bio);
+ bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ bio_io_error(bio);
+ return;
+ }
- bio->bi_private = bbio;
- btrfs_io_bio(bio)->stripe_index = dev_nr;
- bio->bi_end_io = btrfs_end_bio;
- bio->bi_iter.bi_sector = physical >> 9;
- btrfs_debug_in_rcu(fs_info,
- "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
- bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
- btrfs_bio_counter_inc_noblocked(fs_info);
+ /*
+ * For zone append writing, bi_sector must point the beginning of the
+ * zone
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical,
+ dev->fs_info->zone_size);
- btrfsic_submit_bio(bio);
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
+ btrfs_debug_in_rcu(dev->fs_info,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
+ dev->devid, bio->bi_iter.bi_size);
+
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
}
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
- atomic_inc(&bbio->error);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
- /* Should be the original bio. */
- WARN_ON(bio != bbio->orig_bio);
+ struct bio *orig_bio = bioc->orig_bio, *bio;
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
- bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bbio->error) > bbio->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
- btrfs_end_bbio(bbio, bio);
+ ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+ /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+ if (dev_nr == bioc->num_stripes - 1) {
+ bio = orig_bio;
+ bio->bi_end_io = btrfs_orig_write_end_io;
+ } else {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ bio->bi_end_io = btrfs_clone_write_end_io;
}
+
+ bio->bi_private = &bioc->stripes[dev_nr];
+ bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+ bioc->stripes[dev_nr].bioc = bioc;
+ btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num)
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
- struct btrfs_device *dev;
- struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
+ u64 logical = bio->bi_iter.bi_sector << 9;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_io_stripe smap;
int ret;
- int dev_nr;
- int total_devs;
- struct btrfs_bio *bbio = NULL;
-
- length = bio->bi_iter.bi_size;
- map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bbio, mirror_num, 1);
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
- }
-
- total_devs = bbio->num_stripes;
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = fs_info;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
- if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
- /* In this case, map_length has been set to the length of
- a single stripe; not the whole write */
- if (bio_op(bio) == REQ_OP_WRITE) {
- ret = raid56_parity_write(fs_info, bio, bbio,
- map_length);
- } else {
- ret = raid56_parity_recover(fs_info, bio, bbio,
- map_length, mirror_num, 1);
- }
-
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return;
}
if (map_length < length) {
@@ -6321,26 +6890,58 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
BUG();
}
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bbio->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
- &dev->dev_state) ||
- (bio_op(first_bio) == REQ_OP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bbio_error(bbio, first_bio, logical);
- continue;
- }
-
- if (dev_nr < total_devs - 1)
- bio = btrfs_bio_clone(first_bio);
+ if (!bioc) {
+ /* Single mirror read/write fast path */
+ btrfs_bio(bio)->mirror_num = mirror_num;
+ btrfs_bio(bio)->device = smap.dev;
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ bio->bi_private = fs_info;
+ bio->bi_end_io = btrfs_simple_end_io;
+ btrfs_submit_dev_bio(smap.dev, bio);
+ } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* Parity RAID write or read recovery */
+ bio->bi_private = bioc;
+ bio->bi_end_io = btrfs_raid56_end_io;
+ if (bio_op(bio) == REQ_OP_READ)
+ raid56_parity_recover(bio, bioc, mirror_num);
else
- bio = first_bio;
+ raid56_parity_write(bio, bioc);
+ } else {
+ /* Write to multiple mirrors */
+ int total_devs = bioc->num_stripes;
+ int dev_nr;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
- dev_nr);
+ bioc->orig_bio = bio;
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ btrfs_submit_mirrored_bio(bioc, dev_nr);
}
- btrfs_bio_counter_dec(fs_info);
- return BLK_STS_OK;
+}
+
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ if (args->missing) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+ }
+
+ if (device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ return true;
}
/*
@@ -6349,31 +6950,29 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
- *
- * If @seed is true, traverse through the seed devices.
*/
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid,
- bool seed)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
- while (fs_devices) {
- if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
- dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
- return device;
- }
+ if (dev_args_match_fs_devices(args, fs_devices)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
+ }
+ }
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6381,8 +6980,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6419,9 +7027,18 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
- if (IS_ERR(dev))
- return dev;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->post_commit_list);
+
+ atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
if (devid)
tmp = *devid;
@@ -6455,24 +7072,79 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+u64 btrfs_calc_stripe_length(const struct extent_map *em)
{
- int index = btrfs_bg_flags_to_raid_index(type);
- int ncopies = btrfs_raid_array[index].ncopies;
- const int nparity = btrfs_raid_array[index].nparity;
- int data_stripes;
+ const struct map_lookup *map = em->map_lookup;
+ const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- if (nparity)
- data_stripes = num_stripes - nparity;
- else
- data_stripes = num_stripes / ncopies;
+ return div_u64(em->len, data_stripes);
+}
+
+#if BITS_PER_LONG == 32
+/*
+ * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
+ * can't be accessed on 32bit systems.
+ *
+ * This function do mount time check to reject the fs if it already has
+ * metadata chunk beyond that limit.
+ */
+static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return 0;
+
+ if (logical + length < MAX_LFS_FILESIZE)
+ return 0;
+
+ btrfs_err_32bit_limit(fs_info);
+ return -EOVERFLOW;
+}
+
+/*
+ * This is to give early warning for any metadata chunk reaching
+ * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
+ * Although we can still access the metadata, it's not going to be possible
+ * once the limit is reached.
+ */
+static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return;
+
+ if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ return;
+
+ btrfs_warn_32bit_limit(fs_info);
+}
+#endif
+
+static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return ERR_PTR(-ENOENT);
+ }
+
+ dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
+ if (IS_ERR(dev)) {
+ btrfs_err(fs_info, "failed to init missing device %llu: %ld",
+ devid, PTR_ERR(dev));
+ return dev;
+ }
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
- return div_u64(chunk_len, data_stripes);
+ return dev;
}
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
@@ -6480,15 +7152,26 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
u64 logical;
u64 length;
u64 devid;
+ u64 type;
u8 uuid[BTRFS_UUID_SIZE];
+ int index;
int num_stripes;
int ret;
int i;
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+ index = btrfs_bg_flags_to_raid_index(type);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+#if BITS_PER_LONG == 32
+ ret = check_32bit_meta_chunk(fs_info, logical, length, type);
+ if (ret < 0)
+ return ret;
+ warn_32bit_meta_chunk(fs_info, logical, length, type);
+#endif
+
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
@@ -6532,42 +7215,39 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
- map->type = btrfs_chunk_type(leaf, chunk);
- map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ map->type = type;
+ /*
+ * We can't use the sub_stripes value, as for profiles other than
+ * RAID10, they may have 0 as sub_stripes for filesystems created by
+ * older mkfs (<v5.4).
+ * In that case, it can cause divide-by-zero errors later.
+ * Since currently sub_stripes is fixed for each profile, let's
+ * use the trusted value instead.
+ */
+ map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(map->type, em->len,
- map->num_stripes);
+ em->orig_block_len = btrfs_calc_stripe_length(em);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL, true);
- if (!map->stripes[i].dev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
- free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid, true);
- return -ENOENT;
- }
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!map->stripes[i].dev) {
- map->stripes[i].dev =
- add_missing_dev(fs_info->fs_devices, devid,
- uuid);
+ map->stripes[i].dev = handle_missing_device(fs_info,
+ devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
- btrfs_err(fs_info,
- "failed to init missing dev %llu: %ld",
- devid, PTR_ERR(map->stripes[i].dev));
return PTR_ERR(map->stripes[i].dev);
}
- btrfs_report_missing_device(fs_info, devid, uuid, false);
}
+
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
-
}
write_lock(&map_tree->lock);
@@ -6615,13 +7295,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6637,6 +7315,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6644,26 +7326,24 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
@@ -6673,10 +7353,13 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 dev_uuid[BTRFS_UUID_SIZE];
devid = btrfs_device_id(leaf, dev_item);
+ args.devid = devid;
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
@@ -6684,8 +7367,7 @@ static int read_one_dev(struct extent_buffer *leaf,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -6748,6 +7430,16 @@ static int read_one_dev(struct extent_buffer *leaf,
}
fill_device_from_item(leaf, dev_item, device);
+ if (device->bdev) {
+ u64 max_total_bytes = bdev_nr_bytes(device->bdev);
+
+ if (device->total_bytes > max_total_bytes) {
+ btrfs_err(fs_info,
+ "device total_bytes should be at most %llu but found %llu",
+ max_total_bytes, device->total_bytes);
+ return -EINVAL;
+ }
+ }
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@ -6761,7 +7453,6 @@ static int read_one_dev(struct extent_buffer *leaf,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
@@ -6777,30 +7468,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
+
/*
- * This will create extent buffer of nodesize, superblock size is
- * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
- * overallocate but we can keep it as-is, only the first page is used.
+ * We allocated a dummy extent, just to use extent buffer accessors.
+ * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
+ * that's fine, we will not go beyond system chunk array anyway.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
set_extent_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
- /*
- * The sb extent buffer is artificial and just used to read the system array.
- * set_extent_buffer_uptodate() call does not properly mark all it's
- * pages up-to-date when the page is larger: extent does not cover the
- * whole page and consequently check_page_uptodate does not find all
- * the page's extents up-to-date (the hole beyond sb),
- * write_extent_buffer then triggers a WARN_ON.
- *
- * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
- * but sb spans only this function. Add an explicit SetPageUptodate call
- * to silence the warning eg. on PowerPC 64.
- */
- if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
- SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -6945,6 +7622,15 @@ out:
return ret;
}
+static void readahead_tree_node_children(struct extent_buffer *node)
+{
+ int i;
+ const int nr_items = btrfs_header_nritems(node);
+
+ for (i = 0; i < nr_items; i++)
+ btrfs_readahead_node_child(node, i);
+}
+
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -6954,7 +7640,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
int ret;
int slot;
+ int iter_ret = 0;
u64 total_dev = 0;
+ u64 last_ra_node = 0;
path = btrfs_alloc_path();
if (!path)
@@ -6965,7 +7653,27 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
- mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
+
+ /*
+ * Lockdep complains about possible circular locking dependency between
+ * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
+ * used for freeze procection of a fs (struct super_block.s_writers),
+ * which we take when starting a transaction, and extent buffers of the
+ * chunk tree if we call read_one_dev() while holding a lock on an
+ * extent buffer of the chunk tree. Since we are mounting the filesystem
+ * and at this point there can't be any concurrent task modifying the
+ * chunk tree, to keep it simple, just skip locking on the chunk tree.
+ */
+ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+ path->skip_locking = 1;
/*
* Read all device items, and then all the chunk items. All
@@ -6976,21 +7684,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *node = path->nodes[1];
+
leaf = path->nodes[0];
slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- break;
+
+ if (node) {
+ if (last_ra_node != node->start) {
+ readahead_tree_node_children(node);
+ last_ra_node = node->start;
+ }
}
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
@@ -7001,12 +7706,25 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
+
+ /*
+ * We are only called at mount time, so no need to take
+ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
+ * we always lock first fs_info->chunk_mutex before
+ * acquiring any locks on the chunk tree. This is a
+ * requirement for chunk allocation, see the comment on
+ * top of btrfs_chunk_alloc() for details.
+ */
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
ret = read_one_chunk(&found_key, leaf, chunk);
if (ret)
goto error;
}
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto error;
}
/*
@@ -7014,12 +7732,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
@@ -7032,26 +7750,37 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
}
ret = 0;
error:
- mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
return ret;
}
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
+ int ret = 0;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
+ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ break;
+ }
- fs_devices = fs_devices->seed;
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7077,17 +7806,56 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
-int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ if (!device->fs_info->dev_root)
+ return 0;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7095,43 +7863,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
}
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7162,7 +7909,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
}
if (ret == 0 &&
- btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+ btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
@@ -7242,11 +7989,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
- btrfs_dev_stat_print_on_error(dev);
-}
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
-{
if (!dev->dev_stats_valid)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
@@ -7282,13 +8025,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
- true);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7317,36 +8061,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
return 0;
}
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
-{
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- int copy_num;
-
- if (!bdev)
- return;
-
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
- copy_num++) {
-
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
- continue;
-
- disk_super = (struct btrfs_super_block *)bh->b_data;
-
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
- }
-
- /* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
- /* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
-}
-
/*
* Update the size and bytes used for each device where it changed. This is
* delayed since we would otherwise get errors while writing out the
@@ -7378,24 +8092,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7412,6 +8108,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
@@ -7434,7 +8131,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
map = em->map_lookup;
- stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+ stripe_len = btrfs_calc_stripe_length(em);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
@@ -7444,6 +8141,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
+ /*
+ * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * space. Although kernel can handle it without problem, better to warn
+ * the users.
+ */
+ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
+ btrfs_warn(fs_info,
+ "devid %llu physical %llu len %llu inside the reserved space",
+ devid, physical_offset, physical_len);
+
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->devid == devid &&
map->stripes[i].physical == physical_offset) {
@@ -7466,26 +8173,14 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
}
- /* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ /* Make sure no dev extent is beyond device boundary */
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- /* It's possible this device is a dummy for seed device */
- if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
- if (!dev) {
- btrfs_err(fs_info, "failed to find seed devid %llu",
- devid);
- ret = -EUCLEAN;
- goto out;
- }
- }
-
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
@@ -7494,6 +8189,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
goto out;
}
+
+ if (dev->zone_info) {
+ u64 zone_size = dev->zone_info->zone_size;
+
+ if (!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size)) {
+ btrfs_err(fs_info,
+"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
+ devid, physical_offset, physical_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
out:
free_extent_map(em);
return ret;
@@ -7540,6 +8249,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
u64 prev_dev_ext_end = 0;
int ret = 0;
+ /*
+ * We don't have a dev_root because we mounted with ignorebadroots and
+ * failed to load the root, so we want to skip the verification in this
+ * case for sure.
+ *
+ * However if the dev root is fine, but the tree itself is corrupted
+ * we'd still fail to mount. This verification is only to make sure
+ * writes can happen safely, so instead just bypass this check
+ * completely in the case of IGNOREBADROOTS.
+ */
+ if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ return 0;
+
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
@@ -7554,7 +8276,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_item(root, path);
+ ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
@@ -7637,3 +8359,91 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
spin_unlock(&fs_info->swapfile_pins_lock);
return node != NULL;
}
+
+static int relocating_repair_kthread(void *data)
+{
+ struct btrfs_block_group *cache = data;
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 target;
+ int ret = 0;
+
+ target = cache->start;
+ btrfs_put_block_group(cache);
+
+ sb_start_write(fs_info->sb);
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ btrfs_info(fs_info,
+ "zoned: skip relocating block group %llu to repair: EBUSY",
+ target);
+ sb_end_write(fs_info->sb);
+ return -EBUSY;
+ }
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+
+ /* Ensure block group still exists */
+ cache = btrfs_lookup_block_group(fs_info, target);
+ if (!cache)
+ goto out;
+
+ if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
+ goto out;
+
+ ret = btrfs_may_alloc_data_chunk(fs_info, target);
+ if (ret < 0)
+ goto out;
+
+ btrfs_info(fs_info,
+ "zoned: relocating block group %llu to repair IO failure",
+ target);
+ ret = btrfs_relocate_chunk(fs_info, target);
+
+out:
+ if (cache)
+ btrfs_put_block_group(cache);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
+
+ return ret;
+}
+
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ /* Do not attempt to repair in degraded state */
+ if (btrfs_test_opt(fs_info, DEGRADED))
+ return true;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return true;
+
+ if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
+ btrfs_put_block_group(cache);
+ return true;
+ }
+
+ kthread_run(relocating_repair_kthread, cache,
+ "btrfs-relocating-repair");
+
+ return true;
+}
+
+int __init btrfs_bioset_init(void)
+{
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+ bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f01552a0785e..099def5613b8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,7 +17,39 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
-struct buffer_head;
+/* Used by sanity check for btrfs_raid_types. */
+#define const_ffs(n) (__builtin_ctzll(n) + 1)
+
+/*
+ * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+ * RAID0 always to be the lowest profile bit.
+ * Although it's part of on-disk format and should never change, do extra
+ * compile-time sanity checks.
+ */
+static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+ const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+ ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+
+/* ilog2() can handle both constants and variables */
+#define BTRFS_BG_FLAG_TO_INDEX(profile) \
+ ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+
+enum btrfs_raid_types {
+ /* SINGLE is the special one as it doesn't have on-disk bit. */
+ BTRFS_RAID_SINGLE = 0,
+
+ BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+ BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+ BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+ BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+ BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+ BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+ BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+ BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+
+ BTRFS_NR_RAID_TYPES
+};
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
@@ -25,11 +57,11 @@ struct btrfs_io_geometry {
/* offset of logical address in chunk */
u64 offset;
/* length of single IO stripe */
- u64 stripe_len;
+ u32 stripe_len;
+ /* offset of address in stripe */
+ u32 stripe_offset;
/* number of stripe where address falls */
u64 stripe_nr;
- /* offset of address in stripe */
- u64 stripe_offset;
/* offset of raid56 stripe into the chunk */
u64 raid56_stripe_offset;
};
@@ -52,6 +84,9 @@ struct btrfs_io_geometry {
#define BTRFS_DEV_STATE_MISSING (2)
#define BTRFS_DEV_STATE_REPLACE_TGT (3)
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
+#define BTRFS_DEV_STATE_NO_READA (5)
+
+struct btrfs_zoned_device_info;
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
@@ -60,15 +95,22 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
struct block_device *bdev;
+ struct btrfs_zoned_device_info *zone_info;
+
/* the mode sent to blkdev_get */
fmode_t mode;
+ /*
+ * Device's major-minor number. Must be set even if the device is not
+ * opened (bdev == NULL), unless the device is missing.
+ */
+ dev_t devt;
unsigned long dev_state;
blk_status_t last_flush_error;
@@ -113,20 +155,13 @@ struct btrfs_device {
/* bytes used on the current transaction */
u64 commit_bytes_used;
- /* for sending down flush barriers */
- struct bio *flush_bio;
+ /* Bio used for flushing device barriers */
+ struct bio flush_bio;
struct completion flush_wait;
/* per-device scrub information */
struct scrub_ctx *scrub_ctx;
- /* readahead state */
- atomic_t reada_in_flight;
- u64 reada_next;
- struct reada_zone *reada_curr_zone;
- struct radix_tree_root reada_zones;
- struct radix_tree_root reada_extents;
-
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
int dev_stats_valid;
@@ -140,6 +175,34 @@ struct btrfs_device {
struct completion kobj_unregister;
/* For sysfs/FSID/devinfo/devid/ */
struct kobject devid_kobj;
+
+ /* Bandwidth limit for scrub, in bytes */
+ u64 scrub_speed_max;
+};
+
+/*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
+ */
+ bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
};
/*
@@ -209,23 +272,61 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+enum btrfs_chunk_allocation_policy {
+ BTRFS_CHUNK_ALLOC_REGULAR,
+ BTRFS_CHUNK_ALLOC_ZONED,
+};
+
+/*
+ * Read policies for mirrored block group profiles, read picks the stripe based
+ * on these policies.
+ */
+enum btrfs_read_policy {
+ /* Use process PID to choose the stripe */
+ BTRFS_READ_POLICY_PID,
+ BTRFS_NR_READ_POLICY,
+};
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
u8 metadata_uuid[BTRFS_FSID_SIZE];
bool fsid_change;
struct list_head fs_list;
+ /*
+ * Number of devices under this fsid including missing and
+ * replace-target device and excludes seed devices.
+ */
u64 num_devices;
+
+ /*
+ * The number of devices that successfully opened, including
+ * replace-target, excludes seed devices.
+ */
u64 open_devices;
+
+ /* The number of devices that are under the chunk allocation list. */
u64 rw_devices;
+
+ /* Count of missing devices under this fsid excluding seed device. */
u64 missing_devices;
u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes the seed device, excludes the transient replace-target
+ * device.
+ */
u64 total_devices;
/* Highest generation number of seen devices */
u64 latest_generation;
- struct block_device *latest_bdev;
+ /*
+ * The mount device or a device with highest generation after removal
+ * or replace.
+ */
+ struct btrfs_device *latest_dev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
@@ -244,7 +345,7 @@ struct btrfs_fs_devices {
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
+ struct list_head seed_list;
bool seeding;
int opened;
@@ -260,6 +361,11 @@ struct btrfs_fs_devices {
struct kobject *devices_kobj;
struct kobject *devinfo_kobj;
struct completion kobj_unregister;
+
+ enum btrfs_chunk_allocation_policy chunk_alloc_policy;
+
+ /* Policy used to read the mirrored stripes */
+ enum btrfs_read_policy read_policy;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -274,55 +380,125 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet. We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios. We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
*/
-struct btrfs_io_bio {
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
+/*
+ * Additional info to pass along bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
+ */
+struct btrfs_bio {
unsigned int mirror_num;
- unsigned int stripe_index;
- u64 logical;
+ struct bvec_iter iter;
+
+ /* for direct I/O */
+ u64 file_offset;
+
+ /* @device is for stripe IO submission. */
+ struct btrfs_device *device;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
- struct bvec_iter iter;
+
+ /* End I/O information supplied to btrfs_bio_alloc */
+ btrfs_bio_end_io_t end_io;
+ void *private;
+
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_io_bio but relies on bio being last.
+ * bytes for entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
+{
+ return container_of(bio, struct btrfs_bio, bio);
+}
+
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private);
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
- return container_of(bio, struct btrfs_io_bio, bio);
+ bbio->bio.bi_status = status;
+ bbio->end_io(bbio);
}
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
- if (io_bio->csum != io_bio->csum_inline) {
- kfree(io_bio->csum);
- io_bio->csum = NULL;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
}
}
-struct btrfs_bio_stripe {
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iters - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
+struct btrfs_io_stripe {
+ struct btrfs_device *dev;
+ union {
+ /* Block mapping */
+ u64 physical;
+ /* For the endio handler */
+ struct btrfs_io_context *bioc;
+ };
+};
+
+struct btrfs_discard_stripe {
struct btrfs_device *dev;
u64 physical;
- u64 length; /* only used for discard mappings */
+ u64 length;
};
-struct btrfs_bio {
+/*
+ * Context for IO subsmission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping logical bio
+ * into physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
refcount_t refs;
- atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
- bio_end_io_t *end_io;
struct bio *orig_bio;
- void *private;
atomic_t error;
int max_errors;
int num_stripes;
@@ -335,7 +511,7 @@ struct btrfs_bio {
* so raid_map[0] is the start of our full stripe
*/
u64 *raid_map;
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
struct btrfs_device_info {
@@ -366,15 +542,15 @@ struct map_lookup {
u64 type;
int io_align;
int io_width;
- u64 stripe_len;
+ u32 stripe_len;
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+ (sizeof(struct btrfs_io_stripe) * (n)))
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -388,6 +564,22 @@ struct btrfs_balance_control {
struct btrfs_balance_progress stat;
};
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
@@ -401,55 +593,65 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
case REQ_OP_DISCARD:
return BTRFS_MAP_DISCARD;
case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
- /* fall through */
+ fallthrough;
case REQ_OP_READ:
return BTRFS_MAP_READ;
}
}
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num);
+ struct btrfs_io_context **bioc_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret);
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
+ struct btrfs_io_context **bioc_ret);
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
+ enum btrfs_map_op op, u64 logical,
+ struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
+ u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num);
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
-int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_forget_devices(dev_t devt);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
u64 devid,
const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid);
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid, bool seed);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -459,31 +661,34 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int btrfs_uuid_scan_kthread(void *data);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats);
-void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size);
+u64 btrfs_calc_stripe_length(const struct extent_map *em);
+int btrfs_nr_parity_stripes(u64 type);
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
@@ -536,42 +741,21 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}
-/*
- * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
- * can be used as index to access btrfs_raid_array[].
- */
-static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
+bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 95d9aebff2c4..5bb8d8c86311 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* matches our target xattr, so lets check.
*/
ret = 0;
- btrfs_assert_tree_locked(path->nodes[0]);
+ btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
@@ -168,9 +168,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const int slot = path->slots[0];
struct extent_buffer *leaf = path->nodes[0];
const u16 old_data_len = btrfs_dir_data_len(leaf, di);
- const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 item_size = btrfs_item_size(leaf, slot);
const u32 data_size = sizeof(*di) + name_len + size;
- struct btrfs_item *item;
unsigned long data_ptr;
char *ptr;
@@ -196,9 +195,8 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_extend_item(path, data_size);
}
- item = btrfs_item_nr(slot);
ptr = btrfs_item_ptr(leaf, slot, char);
- ptr += btrfs_item_size(leaf, item) - data_size;
+ ptr += btrfs_item_size(leaf, slot) - data_size;
di = (struct btrfs_dir_item *)ptr;
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
@@ -213,9 +211,11 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
}
out:
btrfs_free_path(path);
- if (!ret)
+ if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
+ }
return ret;
}
@@ -227,11 +227,33 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
+ const bool start_trans = (current->journal_info == NULL);
int ret;
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (start_trans) {
+ /*
+ * 1 unit for inserting/updating/deleting the xattr
+ * 1 unit for the inode item update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ } else {
+ /*
+ * This can happen when smack is enabled and a directory is being
+ * created. It happens through d_instantiate_new(), which calls
+ * smack_d_instantiate(), which in turn calls __vfs_setxattr() to
+ * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the
+ * inode. We have already reserved space for the xattr and inode
+ * update at btrfs_mkdir(), so just use the transaction handle.
+ * We don't join or start a transaction, as that will reset the
+ * block_rsv of the handle and trigger a warning for the start
+ * case.
+ */
+ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0);
+ trans = current->journal_info;
+ }
ret = btrfs_setxattr(trans, inode, name, value, size, flags);
if (ret)
@@ -239,19 +261,23 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
- btrfs_end_transaction(trans);
+ if (start_trans)
+ btrfs_end_transaction(trans);
return ret;
}
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct btrfs_key found_key;
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
+ int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@@ -270,47 +296,26 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path->reada = READA_FORWARD;
/* search for our xattrs */
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf;
int slot;
struct btrfs_dir_item *di;
- struct btrfs_key found_key;
u32 item_size;
u32 cur;
leaf = path->nodes[0];
slot = path->slots[0];
- /* this is where we start walking through the path */
- if (slot >= btrfs_header_nritems(leaf)) {
- /*
- * if we've reached the last slot in this leaf we need
- * to go to the next leaf and reset everything
- */
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
- goto next_item;
+ continue;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- item_size = btrfs_item_size_nr(leaf, slot);
+ item_size = btrfs_item_size(leaf, slot);
cur = 0;
while (cur < item_size) {
u16 name_len = btrfs_dir_name_len(leaf, di);
@@ -327,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next;
if (!buffer || (name_len + 1) > size_left) {
- ret = -ERANGE;
- goto err;
+ iter_ret = -ERANGE;
+ break;
}
read_extent_buffer(leaf, buffer, name_ptr, name_len);
@@ -340,12 +345,13 @@ next:
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
-next_item:
- path->slots[0]++;
}
- ret = total_size;
-err:
+ if (iter_ret < 0)
+ ret = iter_ret;
+ else
+ ret = total_size;
+
btrfs_free_path(path);
return ret;
@@ -360,15 +366,20 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
}
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
@@ -378,10 +389,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct btrfs_root *root = BTRFS_I(inode)->root;
name = xattr_full_name(handler, name);
- ret = btrfs_validate_prop(name, value, size);
+ ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size);
if (ret)
return ret;
+ if (btrfs_ignore_prop(BTRFS_I(inode), name))
+ return 0;
+
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -390,8 +404,9 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (!ret) {
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 05615a1099db..b4f44662cda7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
- char *data_in;
+ char *data_in = NULL;
char *cpage_out;
int nr_pages = 0;
struct page *in_page = NULL;
@@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = kmap_local_page(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = kmap_local_page(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,18 +196,16 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -234,18 +232,16 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
+
return ret;
}
@@ -286,10 +280,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
- data_in = kmap(pages_in[page_in_index]);
+ data_in = kmap_local_page(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -311,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -326,9 +318,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (buf_start == total_out)
break;
- ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- total_out, disk_start,
- orig_bio);
+ ret2 = btrfs_decompress_buf2page(workspace->buf,
+ total_out - buf_start, cb, buf_start);
if (ret2 == 0) {
ret = 0;
goto done;
@@ -339,17 +330,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = kmap(pages_in[page_in_index]);
+ data_in = kmap_local_page(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp,
- PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, PAGE_SIZE);
}
}
if (ret != Z_STREAM_END)
@@ -359,9 +349,9 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
if (!ret)
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
return ret;
}
@@ -375,7 +365,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long bytes_left;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- char *kaddr;
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes_left = destlen;
@@ -432,9 +421,8 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, bytes_left);
- kaddr = kmap_atomic(dest_page);
- memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
- kunmap_atomic(kaddr);
+ memcpy_to_page(dest_page, pg_offset,
+ workspace->buf + buf_offset, bytes);
pg_offset += bytes;
bytes_left -= bytes;
@@ -456,9 +444,7 @@ next:
* end of the inline extent (destlen) to the end of the page
*/
if (pg_offset < destlen) {
- kaddr = kmap_atomic(dest_page);
- memset(kaddr + pg_offset, 0, destlen - pg_offset);
- kunmap_atomic(kaddr);
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
}
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
new file mode 100644
index 000000000000..1912abf6d020
--- /dev/null
+++ b/fs/btrfs/zoned.c
@@ -0,0 +1,2348 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
+#include <linux/atomic.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "zoned.h"
+#include "rcu-string.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "transaction.h"
+#include "dev-replace.h"
+#include "space-info.h"
+
+/* Maximum number of zones to report per blkdev_report_zones() call */
+#define BTRFS_REPORT_NR_ZONES 4096
+/* Invalid allocation pointer value for missing devices */
+#define WP_MISSING_DEV ((u64)-1)
+/* Pseudo write pointer value for conventional zone */
+#define WP_CONVENTIONAL ((u64)-2)
+
+/*
+ * Location of the first zone of superblock logging zone pairs.
+ *
+ * - primary superblock: 0B (zone 0)
+ * - first copy: 512G (zone starting at that offset)
+ * - second copy: 4T (zone starting at that offset)
+ */
+#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
+#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
+#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
+
+#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+
+/* Number of superblock log zones */
+#define BTRFS_NR_SB_LOG_ZONES 2
+
+/*
+ * Minimum of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
+/*
+ * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+ * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+ * We do not expect the zone size to become larger than 8GiB or smaller than
+ * 4MiB in the near future.
+ */
+#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define BTRFS_MIN_ZONE_SIZE SZ_4M
+
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
+static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+ struct blk_zone *zones = data;
+
+ memcpy(&zones[idx], zone, sizeof(*zone));
+
+ return 0;
+}
+
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
+ u64 *wp_ret)
+{
+ bool empty[BTRFS_NR_SB_LOG_ZONES];
+ bool full[BTRFS_NR_SB_LOG_ZONES];
+ sector_t sector;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
+
+ /*
+ * Possible states of log buffer zones
+ *
+ * Empty[0] In use[0] Full[0]
+ * Empty[1] * 0 1
+ * In use[1] x x 1
+ * Full[1] 0 0 C
+ *
+ * Log position:
+ * *: Special case, no superblock is written
+ * 0: Use write pointer of zones[0]
+ * 1: Use write pointer of zones[1]
+ * C: Compare super blocks from zones[0] and zones[1], use the latest
+ * one determined by generation
+ * x: Invalid state
+ */
+
+ if (empty[0] && empty[1]) {
+ /* Special case to distinguish no superblock to read */
+ *wp_ret = zones[0].start << SECTOR_SHIFT;
+ return -ENOENT;
+ } else if (full[0] && full[1]) {
+ /* Compare two super blocks */
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct page *page[BTRFS_NR_SB_LOG_ZONES];
+ struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
+ int i;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ u64 bytenr;
+
+ bytenr = ((zones[i].start + zones[i].len)
+ << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
+
+ page[i] = read_cache_page_gfp(mapping,
+ bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page[i])) {
+ if (i == 1)
+ btrfs_release_disk_super(super[0]);
+ return PTR_ERR(page[i]);
+ }
+ super[i] = page_address(page[i]);
+ }
+
+ if (super[0]->generation > super[1]->generation)
+ sector = zones[1].start;
+ else
+ sector = zones[0].start;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ btrfs_release_disk_super(super[i]);
+ } else if (!full[0] && (empty[1] || full[1])) {
+ sector = zones[0].wp;
+ } else if (full[0]) {
+ sector = zones[1].wp;
+ } else {
+ return -EUCLEAN;
+ }
+ *wp_ret = sector << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * Get the first zone number of the superblock mirror
+ */
+static inline u32 sb_zone_number(int shift, int mirror)
+{
+ u64 zone;
+
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ switch (mirror) {
+ case 0: zone = 0; break;
+ case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
+ case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
+ }
+
+ ASSERT(zone <= U32_MAX);
+
+ return (u32)zone;
+}
+
+static inline sector_t zone_start_sector(u32 zone_number,
+ struct block_device *bdev)
+{
+ return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
+}
+
+static inline u64 zone_start_physical(u32 zone_number,
+ struct btrfs_zoned_device_info *zone_info)
+{
+ return (u64)zone_number << zone_info->zone_size_shift;
+}
+
+/*
+ * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
+ * device into static sized chunks and fake a conventional zone on each of
+ * them.
+ */
+static int emulate_report_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int nr_zones)
+{
+ const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
+ sector_t bdev_size = bdev_nr_sectors(device->bdev);
+ unsigned int i;
+
+ pos >>= SECTOR_SHIFT;
+ for (i = 0; i < nr_zones; i++) {
+ zones[i].start = i * zone_sectors + pos;
+ zones[i].len = zone_sectors;
+ zones[i].capacity = zone_sectors;
+ zones[i].wp = zones[i].start + zone_sectors;
+ zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zones[i].cond = BLK_ZONE_COND_NOT_WP;
+
+ if (zones[i].wp >= bdev_size) {
+ i++;
+ break;
+ }
+ }
+
+ return i;
+}
+
+static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int *nr_zones)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zno;
+ int ret;
+
+ if (!*nr_zones)
+ return 0;
+
+ if (!bdev_is_zoned(device->bdev)) {
+ ret = emulate_report_zones(device, pos, zones, *nr_zones);
+ *nr_zones = ret;
+ return 0;
+ }
+
+ /* Check cache */
+ if (zinfo->zone_cache) {
+ unsigned int i;
+
+ ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+ zno = pos >> zinfo->zone_size_shift;
+ /*
+ * We cannot report zones beyond the zone end. So, it is OK to
+ * cap *nr_zones to at the end.
+ */
+ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
+
+ for (i = 0; i < *nr_zones; i++) {
+ struct blk_zone *zone_info;
+
+ zone_info = &zinfo->zone_cache[zno + i];
+ if (!zone_info->len)
+ break;
+ }
+
+ if (i == *nr_zones) {
+ /* Cache hit on all the zones */
+ memcpy(zones, zinfo->zone_cache + zno,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+ return 0;
+ }
+ }
+
+ ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+ copy_zone_info_cb, zones);
+ if (ret < 0) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read zone %llu on %s (devid %llu)",
+ pos, rcu_str_deref(device->name),
+ device->devid);
+ return ret;
+ }
+ *nr_zones = ret;
+ if (!ret)
+ return -EIO;
+
+ /* Populate cache */
+ if (zinfo->zone_cache)
+ memcpy(zinfo->zone_cache + zno, zones,
+ sizeof(*zinfo->zone_cache) * *nr_zones);
+
+ return 0;
+}
+
+/* The emulated zone size is determined from the size of device extent */
+static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_dev_extent *dext;
+ int ret = 0;
+
+ key.objectid = 1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ /* No dev extents at all? Not good */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+ leaf = path->nodes[0];
+ dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ int ret = 0;
+
+ /* fs_info->zone_size might not set yet. Use the incomapt flag here. */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ /* We can skip reading of zone info for missing devices */
+ if (!device->bdev)
+ continue;
+
+ ret = btrfs_get_dev_zone_info(device, true);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_zoned_device_info *zone_info = NULL;
+ struct block_device *bdev = device->bdev;
+ unsigned int max_active_zones;
+ unsigned int nactive;
+ sector_t nr_sectors;
+ sector_t sector = 0;
+ struct blk_zone *zones = NULL;
+ unsigned int i, nreported = 0, nr_zones;
+ sector_t zone_sectors;
+ char *model, *emulated;
+ int ret;
+
+ /*
+ * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
+ * yet be set.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return 0;
+
+ if (device->zone_info)
+ return 0;
+
+ zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return -ENOMEM;
+
+ device->zone_info = zone_info;
+
+ if (!bdev_is_zoned(bdev)) {
+ if (!fs_info->zone_size) {
+ ret = calculate_emulated_zone_size(fs_info);
+ if (ret)
+ goto out;
+ }
+
+ ASSERT(fs_info->zone_size);
+ zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+ } else {
+ zone_sectors = bdev_zone_sectors(bdev);
+ }
+
+ /* Check if it's power of 2 (see is_power_of_2) */
+ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+ zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+
+ /* We reject devices with a zone size larger than 8GB */
+ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu larger than supported maximum %llu",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
+ } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu smaller than supported minimum %u",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ nr_sectors = bdev_nr_sectors(bdev);
+ zone_info->zone_size_shift = ilog2(zone_info->zone_size);
+ zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+ /*
+ * We limit max_zone_append_size also by max_segments *
+ * PAGE_SIZE. Technically, we can have multiple pages per segment. But,
+ * since btrfs adds the pages one by one to a bio, and btrfs cannot
+ * increase the metadata reservation even if it increases the number of
+ * extents, it is safe to stick with the limit.
+ *
+ * With the zoned emulation, we can have non-zoned device on the zoned
+ * mode. In this case, we don't have a valid max zone append size. So,
+ * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
+ */
+ if (bdev_is_zoned(bdev)) {
+ zone_info->max_zone_append_size = min_t(u64,
+ (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ } else {
+ zone_info->max_zone_append_size =
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT;
+ }
+ if (!IS_ALIGNED(nr_sectors, zone_sectors))
+ zone_info->nr_zones++;
+
+ max_active_zones = bdev_max_active_zones(bdev);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Enable zone cache only for a zoned device. On a non-zoned device, we
+ * fill the zone info with emulated CONVENTIONAL zones, so no need to
+ * use the cache.
+ */
+ if (populate_cache && bdev_is_zoned(device->bdev)) {
+ zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
+ zone_info->nr_zones);
+ if (!zone_info->zone_cache) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to allocate zone cache for %s",
+ rcu_str_deref(device->name));
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* Get zones type */
+ nactive = 0;
+ while (sector < nr_sectors) {
+ nr_zones = BTRFS_REPORT_NR_ZONES;
+ ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < nr_zones; i++) {
+ if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
+ __set_bit(nreported, zone_info->seq_zones);
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
+ __set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
+ nreported++;
+ }
+ sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
+ }
+
+ if (nreported != zone_info->nr_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "inconsistent number of zones on %s (%u/%u)",
+ rcu_str_deref(device->name), nreported,
+ zone_info->nr_zones);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ }
+
+ /* Validate superblock log */
+ nr_zones = BTRFS_NR_SB_LOG_ZONES;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_wp;
+ int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
+
+ sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
+ if (sb_zone + 1 >= zone_info->nr_zones)
+ continue;
+
+ ret = btrfs_get_dev_zones(device,
+ zone_start_physical(sb_zone, zone_info),
+ &zone_info->sb_zones[sb_pos],
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read super block log zone info at devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /*
+ * If zones[0] is conventional, always use the beginning of the
+ * zone to record superblock. No need to validate in that case.
+ */
+ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
+ BLK_ZONE_TYPE_CONVENTIONAL)
+ continue;
+
+ ret = sb_write_pointer(device->bdev,
+ &zone_info->sb_zones[sb_pos], &sb_wp);
+ if (ret != -ENOENT && ret) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: super block log zone corrupted devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+
+ kfree(zones);
+
+ switch (bdev_zoned_model(bdev)) {
+ case BLK_ZONED_HM:
+ model = "host-managed zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_HA:
+ model = "host-aware zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_NONE:
+ model = "regular";
+ emulated = "emulated ";
+ break;
+ default:
+ /* Just in case */
+ btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
+ bdev_zoned_model(bdev),
+ rcu_str_deref(device->name));
+ ret = -EOPNOTSUPP;
+ goto out_free_zone_info;
+ }
+
+ btrfs_info_in_rcu(fs_info,
+ "%s block device %s, %u %szones of %llu bytes",
+ model, rcu_str_deref(device->name), zone_info->nr_zones,
+ emulated, zone_info->zone_size);
+
+ return 0;
+
+out:
+ kfree(zones);
+out_free_zone_info:
+ btrfs_destroy_dev_zone_info(device);
+
+ return ret;
+}
+
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return;
+
+ bitmap_free(zone_info->active_zones);
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ vfree(zone_info->zone_cache);
+ kfree(zone_info);
+ device->zone_info = NULL;
+}
+
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
+{
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return NULL;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones)
+ goto out;
+
+ bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
+ zone_info->nr_zones);
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones)
+ goto out;
+
+ bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
+ zone_info->nr_zones);
+
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones)
+ goto out;
+
+ bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
+ zone_info->nr_zones);
+ zone_info->zone_cache = NULL;
+
+ return zone_info;
+
+out:
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->active_zones);
+ kfree(zone_info);
+ return NULL;
+}
+
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ unsigned int nr_zones = 1;
+ int ret;
+
+ ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
+ if (ret != 0 || !nr_zones)
+ return ret ? ret : -EIO;
+
+ return 0;
+}
+
+static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ if (device->bdev &&
+ bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found: %pg",
+ device->bdev);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+ u64 zone_size = 0;
+ u64 max_zone_append_size = 0;
+ int ret;
+
+ /*
+ * Host-Managed devices can't be used without the ZONED flag. With the
+ * ZONED all devices can be used, using zone emulation if required.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return btrfs_check_for_zoned_device(fs_info);
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
+ "zoned: unequal block device zone sizes: have %llu found %llu",
+ zone_info->zone_size, zone_size);
+ return -EINVAL;
+ }
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size = zone_info->max_zone_append_size;
+ }
+
+ /*
+ * stripe_size is always aligned to BTRFS_STRIPE_LEN in
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
+ * check the alignment here.
+ */
+ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
+ btrfs_err(fs_info,
+ "zoned: zone size %llu not aligned to stripe %u",
+ zone_size, BTRFS_STRIPE_LEN);
+ return -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ btrfs_err(fs_info, "zoned: mixed block groups not supported");
+ return -EINVAL;
+ }
+
+ fs_info->zone_size = zone_size;
+ fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
+ fs_info->sectorsize);
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+ if (fs_info->max_zone_append_size < fs_info->max_extent_size)
+ fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+ /*
+ * Check mount options here, because we might change fs_info->zoned
+ * from fs_info->zone_size.
+ */
+ ret = btrfs_check_mountopts_zoned(fs_info);
+ if (ret)
+ return ret;
+
+ btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
+ return 0;
+}
+
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ if (!btrfs_is_zoned(info))
+ return 0;
+
+ /*
+ * Space cache writing is not COWed. Disable that to avoid write errors
+ * in sequential zones.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_err(info, "zoned: space cache v1 is not supported");
+ return -EINVAL;
+ }
+
+ if (btrfs_test_opt(info, NODATACOW)) {
+ btrfs_err(info, "zoned: NODATACOW not supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
+ int rw, u64 *bytenr_ret)
+{
+ u64 wp;
+ int ret;
+
+ if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ *bytenr_ret = zones[0].start << SECTOR_SHIFT;
+ return 0;
+ }
+
+ ret = sb_write_pointer(bdev, zones, &wp);
+ if (ret != -ENOENT && ret < 0)
+ return ret;
+
+ if (rw == WRITE) {
+ struct blk_zone *reset = NULL;
+
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ reset = &zones[0];
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ reset = &zones[1];
+
+ if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ ASSERT(sb_zone_is_full(reset));
+
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ reset->start, reset->len,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ reset->cond = BLK_ZONE_COND_EMPTY;
+ reset->wp = reset->start;
+ }
+ } else if (ret != -ENOENT) {
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
+ wp -= BTRFS_SUPER_INFO_SIZE;
+ }
+
+ *bytenr_ret = wp;
+ return 0;
+
+}
+
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
+ sector_t zone_sectors;
+ u32 sb_zone;
+ int ret;
+ u8 zone_sectors_shift;
+ sector_t nr_sectors;
+ u32 nr_zones;
+
+ if (!bdev_is_zoned(bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ ASSERT(rw == READ || rw == WRITE);
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ if (!is_power_of_2(zone_sectors))
+ return -EINVAL;
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
+ BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
+ zones);
+ if (ret < 0)
+ return ret;
+ if (ret != BTRFS_NR_SB_LOG_ZONES)
+ return -EIO;
+
+ return sb_log_location(bdev, zones, rw, bytenr_ret);
+}
+
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zone_num;
+
+ /*
+ * For a zoned filesystem on a non-zoned block device, use the same
+ * super block locations as regular filesystem. Doing so, the super
+ * block can always be retrieved and the zoned flag of the volume
+ * detected from the super block information.
+ */
+ if (!bdev_is_zoned(device->bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return -ENOENT;
+
+ return sb_log_location(device->bdev,
+ &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
+ rw, bytenr_ret);
+}
+
+static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
+ int mirror)
+{
+ u32 zone_num;
+
+ if (!zinfo)
+ return false;
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return false;
+
+ if (!test_bit(zone_num, zinfo->seq_zones))
+ return false;
+
+ return true;
+}
+
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ struct blk_zone *zone;
+ int i;
+
+ if (!is_sb_log_zone(zinfo, mirror))
+ return 0;
+
+ zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write new superblock. Since
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ zone->wp = zone->start + zone->len;
+ zone->cond = BLK_ZONE_COND_FULL;
+ }
+ return 0;
+ }
+
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
+}
+
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ sector_t zone_sectors;
+ sector_t nr_sectors;
+ u8 zone_sectors_shift;
+ u32 sb_zone;
+ u32 nr_zones;
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+}
+
+/**
+ * btrfs_find_allocatable_zones - find allocatable zones within a given region
+ *
+ * @device: the device to allocate a region on
+ * @hole_start: the position of the hole to allocate the region
+ * @num_bytes: size of wanted region
+ * @hole_end: the end of the hole
+ * @return: position of allocatable zones
+ *
+ * Allocatable region should not contain any superblock locations.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ const u8 shift = zinfo->zone_size_shift;
+ u64 nzones = num_bytes >> shift;
+ u64 pos = hole_start;
+ u64 begin, end;
+ bool have_sb;
+ int i;
+
+ ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+ while (pos < hole_end) {
+ begin = pos >> shift;
+ end = begin + nzones;
+
+ if (end > zinfo->nr_zones)
+ return hole_end;
+
+ /* Check if zones in the region are all empty */
+ if (btrfs_dev_is_sequential(device, pos) &&
+ find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+ pos += zinfo->zone_size;
+ continue;
+ }
+
+ have_sb = false;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_pos;
+
+ sb_zone = sb_zone_number(shift, i);
+ if (!(end <= sb_zone ||
+ sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
+ have_sb = true;
+ pos = zone_start_physical(
+ sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
+ break;
+ }
+
+ /* We also need to exclude regular superblock positions */
+ sb_pos = btrfs_sb_offset(i);
+ if (!(pos + num_bytes <= sb_pos ||
+ sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
+ have_sb = true;
+ pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+ zinfo->zone_size);
+ break;
+ }
+ }
+ if (!have_sb)
+ break;
+ }
+
+ return pos;
+}
+
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes)
+{
+ int ret;
+
+ *bytes = 0;
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ *bytes = length;
+ while (length) {
+ btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
+ physical += device->zone_info->zone_size;
+ length -= device->zone_info->zone_size;
+ }
+
+ return 0;
+}
+
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ const u8 shift = zinfo->zone_size_shift;
+ unsigned long begin = start >> shift;
+ unsigned long end = (start + size) >> shift;
+ u64 pos;
+ int ret;
+
+ ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+ ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+ if (end > zinfo->nr_zones)
+ return -ERANGE;
+
+ /* All the zones are conventional */
+ if (find_next_bit(zinfo->seq_zones, begin, end) == end)
+ return 0;
+
+ /* All the zones are sequential and empty */
+ if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
+ find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
+ return 0;
+
+ for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+ u64 reset_bytes;
+
+ if (!btrfs_dev_is_sequential(device, pos) ||
+ btrfs_dev_is_empty_zone(device, pos))
+ continue;
+
+ /* Free regions should be empty */
+ btrfs_warn_in_rcu(
+ device->fs_info,
+ "zoned: resetting device %s (devid %llu) zone %llu for allocation",
+ rcu_str_deref(device->name), device->devid, pos >> shift);
+ WARN_ON_ONCE(1);
+
+ ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+ &reset_bytes);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Calculate an allocation pointer from the extent allocation information
+ * for a block group consist of conventional zones. It is pointed to the
+ * end of the highest addressed extent in the block group as an allocation
+ * offset.
+ */
+static int calculate_alloc_pointer(struct btrfs_block_group *cache,
+ u64 *offset_ret, bool new)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ int ret;
+ u64 length;
+
+ /*
+ * Avoid tree lookups for a new block group, there's no use for it.
+ * It must always be 0.
+ *
+ * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+ * For new a block group, this function is called from
+ * btrfs_make_block_group() which is already taking the chunk mutex.
+ * Thus, we cannot call calculate_alloc_pointer() which takes extent
+ * buffer locks to avoid deadlock.
+ */
+ if (new) {
+ *offset_ret = 0;
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = cache->start + cache->length;
+ key.type = 0;
+ key.offset = 0;
+
+ root = btrfs_extent_root(fs_info, key.objectid);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ /* We should not find the exact match */
+ if (!ret)
+ ret = -EUCLEAN;
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_previous_extent_item(root, path, cache->start);
+ if (ret) {
+ if (ret == 1) {
+ ret = 0;
+ *offset_ret = 0;
+ }
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
+ length = found_key.offset;
+ else
+ length = fs_info->nodesize;
+
+ if (!(found_key.objectid >= cache->start &&
+ found_key.objectid + length <= cache->start + cache->length)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ *offset_ret = found_key.objectid + length - cache->start;
+ ret = 0;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 logical = cache->start;
+ u64 length = cache->length;
+ int ret;
+ int i;
+ unsigned int nofs_flag;
+ u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ u64 *physical = NULL;
+ unsigned long *active = NULL;
+ u64 last_alloc = 0;
+ u32 num_sequential = 0, num_conventional = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ /* Sanity check */
+ if (!IS_ALIGNED(length, fs_info->zone_size)) {
+ btrfs_err(fs_info,
+ "zoned: block group %llu len %llu unaligned to zone size %llu",
+ logical, length, fs_info->zone_size);
+ return -EIO;
+ }
+
+ /* Get the chunk mapping */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em)
+ return -EINVAL;
+
+ map = em->map_lookup;
+
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
+ if (!alloc_offsets) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
+ if (!physical) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool is_sequential;
+ struct blk_zone zone;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ int dev_replace_is_ongoing = 0;
+
+ device = map->stripes[i].dev;
+ physical[i] = map->stripes[i].physical;
+
+ if (device->bdev == NULL) {
+ alloc_offsets[i] = WP_MISSING_DEV;
+ continue;
+ }
+
+ is_sequential = btrfs_dev_is_sequential(device, physical[i]);
+ if (is_sequential)
+ num_sequential++;
+ else
+ num_conventional++;
+
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
+
+ if (!is_sequential) {
+ alloc_offsets[i] = WP_CONVENTIONAL;
+ continue;
+ }
+
+ /*
+ * This zone will be used for allocation, so mark this zone
+ * non-empty.
+ */
+ btrfs_dev_clear_zone_empty(device, physical[i]);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write
+ * pointer to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, physical[i], &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret == -EIO || ret == -EOPNOTSUPP) {
+ ret = 0;
+ alloc_offsets[i] = WP_MISSING_DEV;
+ continue;
+ } else if (ret) {
+ goto out;
+ }
+
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT,
+ rcu_str_deref(device->name), device->devid);
+ ret = -EIO;
+ goto out;
+ }
+
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ physical[i] >> device->zone_info->zone_size_shift,
+ rcu_str_deref(device->name), device->devid);
+ alloc_offsets[i] = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ alloc_offsets[i] = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ alloc_offsets[i] = caps[i];
+ break;
+ default:
+ /* Partially used zone */
+ alloc_offsets[i] =
+ ((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
+ break;
+ }
+ }
+
+ if (num_sequential > 0)
+ cache->seq_zone = true;
+
+ if (num_conventional > 0) {
+ /* Zone capacity is always zone size in emulation */
+ cache->zone_capacity = cache->length;
+ ret = calculate_alloc_pointer(cache, &last_alloc, new);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to determine allocation offset of bg %llu",
+ cache->start);
+ goto out;
+ } else if (map->num_stripes == num_conventional) {
+ cache->alloc_offset = last_alloc;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ goto out;
+ }
+ }
+
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case 0: /* single */
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA) {
+ btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[1] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[1]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[0] != alloc_offsets[1]) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ ret = -EIO;
+ goto out;
+ }
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = min(caps[0], caps[1]);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID0:
+ case BTRFS_BLOCK_GROUP_RAID10:
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ /* non-single profiles are not supported yet */
+ default:
+ btrfs_err(fs_info, "zoned: profile %s not yet supported",
+ btrfs_bg_type_to_raid_name(map->type));
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ if (cache->alloc_offset > fs_info->zone_size) {
+ btrfs_err(fs_info,
+ "zoned: invalid write pointer %llu in block group %llu",
+ cache->alloc_offset, cache->start);
+ ret = -EIO;
+ }
+
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
+ /* An extent is allocated after the write pointer */
+ if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
+ btrfs_err(fs_info,
+ "zoned: got wrong write pointer in BG %llu: %llu > %llu",
+ logical, last_alloc, cache->alloc_offset);
+ ret = -EIO;
+ }
+
+ if (!ret) {
+ cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+ } else {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(physical);
+ kfree(caps);
+ kfree(alloc_offsets);
+ free_extent_map(em);
+
+ return ret;
+}
+
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+{
+ u64 unusable, free;
+
+ if (!btrfs_is_zoned(cache->fs_info))
+ return;
+
+ WARN_ON(cache->bytes_super != 0);
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
+
+ /* We only need ->free_space in ALLOC_SEQ block groups */
+ cache->cached = BTRFS_CACHE_FINISHED;
+ cache->free_space_ctl->free_space = free;
+ cache->zone_unusable = unusable;
+}
+
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ if (!btrfs_is_zoned(fs_info) ||
+ btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
+ !list_empty(&eb->release_list))
+ return;
+
+ set_extent_buffer_dirty(eb);
+ set_extent_bits_nowait(&trans->dirty_pages, eb->start,
+ eb->start + eb->len - 1, EXTENT_DIRTY);
+ memzero_extent_buffer(eb, 0, eb->len);
+ set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
+
+ spin_lock(&trans->releasing_ebs_lock);
+ list_add_tail(&eb->release_list, &trans->releasing_ebs);
+ spin_unlock(&trans->releasing_ebs_lock);
+ atomic_inc(&eb->refs);
+}
+
+void btrfs_free_redirty_list(struct btrfs_transaction *trans)
+{
+ spin_lock(&trans->releasing_ebs_lock);
+ while (!list_empty(&trans->releasing_ebs)) {
+ struct extent_buffer *eb;
+
+ eb = list_first_entry(&trans->releasing_ebs,
+ struct extent_buffer, release_list);
+ list_del_init(&eb->release_list);
+ free_extent_buffer(eb);
+ }
+ spin_unlock(&trans->releasing_ebs_lock);
+}
+
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_block_group *cache;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ if (!is_data_inode(&inode->vfs_inode))
+ return false;
+
+ /*
+ * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * extent layout the relocation code has.
+ * Furthermore we have set aside own block-group from which only the
+ * relocation "process" can allocate and make sure only one process at a
+ * time can add pages to an extent that gets relocated, so it's safe to
+ * use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ ASSERT(cache);
+ if (!cache)
+ return false;
+
+ ret = cache->seq_zone;
+ btrfs_put_block_group(cache);
+
+ return ret;
+}
+
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio)
+{
+ struct btrfs_ordered_extent *ordered;
+ const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+ if (bio_op(bio) != REQ_OP_ZONE_APPEND)
+ return;
+
+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
+ if (WARN_ON(!ordered))
+ return;
+
+ ordered->physical = physical;
+ ordered->bdev = bio->bi_bdev;
+
+ btrfs_put_ordered_extent(ordered);
+}
+
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+{
+ struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_ordered_sum *sum;
+ u64 orig_logical = ordered->disk_bytenr;
+ u64 *logical = NULL;
+ int nr, stripe_len;
+
+ /* Zoned devices should not have partitions. So, we can assume it is 0 */
+ ASSERT(!bdev_is_partition(ordered->bdev));
+ if (WARN_ON(!ordered->bdev))
+ return;
+
+ if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
+ ordered->physical, &logical, &nr,
+ &stripe_len)))
+ goto out;
+
+ WARN_ON(nr != 1);
+
+ if (orig_logical == *logical)
+ goto out;
+
+ ordered->disk_bytenr = *logical;
+
+ em_tree = &inode->extent_tree;
+ write_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
+ em->block_start = *logical;
+ free_extent_map(em);
+ write_unlock(&em_tree->lock);
+
+ list_for_each_entry(sum, &ordered->list, list) {
+ if (*logical < orig_logical)
+ sum->bytenr -= orig_logical - *logical;
+ else
+ sum->bytenr += *logical - orig_logical;
+ }
+
+out:
+ kfree(logical);
+}
+
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret)
+{
+ struct btrfs_block_group *cache;
+ bool ret = true;
+
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ cache = btrfs_lookup_block_group(fs_info, eb->start);
+ if (!cache)
+ return true;
+
+ if (cache->meta_write_pointer != eb->start) {
+ btrfs_put_block_group(cache);
+ cache = NULL;
+ ret = false;
+ } else {
+ cache->meta_write_pointer = eb->start + eb->len;
+ }
+
+ *cache_ret = cache;
+
+ return ret;
+}
+
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb)
+{
+ if (!btrfs_is_zoned(eb->fs_info) || !cache)
+ return;
+
+ ASSERT(cache->meta_write_pointer == eb->start + eb->len);
+ cache->meta_write_pointer = eb->start;
+}
+
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
+{
+ if (!btrfs_dev_is_sequential(device, physical))
+ return -EOPNOTSUPP;
+
+ return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
+ length >> SECTOR_SHIFT, GFP_NOFS, 0);
+}
+
+static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
+ struct blk_zone *zone)
+{
+ struct btrfs_io_context *bioc = NULL;
+ u64 mapped_length = PAGE_SIZE;
+ unsigned int nofs_flag;
+ int nmirrors;
+ int i, ret;
+
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ ret = -EIO;
+ goto out_put_bioc;
+ }
+
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EINVAL;
+ goto out_put_bioc;
+ }
+
+ nofs_flag = memalloc_nofs_save();
+ nmirrors = (int)bioc->num_stripes;
+ for (i = 0; i < nmirrors; i++) {
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
+
+ /* Missing device */
+ if (!dev->bdev)
+ continue;
+
+ ret = btrfs_get_dev_zone(dev, physical, zone);
+ /* Failing device */
+ if (ret == -EIO || ret == -EOPNOTSUPP)
+ continue;
+ break;
+ }
+ memalloc_nofs_restore(nofs_flag);
+out_put_bioc:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
+ * filling zeros between @physical_pos to a write pointer of dev-replace
+ * source device.
+ */
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos)
+{
+ struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
+ struct blk_zone zone;
+ u64 length;
+ u64 wp;
+ int ret;
+
+ if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
+ return 0;
+
+ ret = read_zone_info(fs_info, logical, &zone);
+ if (ret)
+ return ret;
+
+ wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
+
+ if (physical_pos == wp)
+ return 0;
+
+ if (physical_pos > wp)
+ return -EUCLEAN;
+
+ length = wp - physical_pos;
+ return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
+}
+
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return ERR_CAST(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ device = map->stripes[0].dev;
+
+ free_extent_map(em);
+
+ return device;
+}
+
+/**
+ * Activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+ int i;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (btrfs_zoned_bg_is_full(block_group)) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+ }
+
+ /* Successfully activated all the zones */
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
+ space_info->active_total_bytes += block_group->length;
+ spin_unlock(&block_group->lock);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
+static void wait_eb_writebacks(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ const u64 end = block_group->start + block_group->length;
+ struct radix_tree_iter iter;
+ struct extent_buffer *eb;
+ void __rcu **slot;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
+ block_group->start >> fs_info->sectorsize_bits) {
+ eb = radix_tree_deref_slot(slot);
+ if (!eb)
+ continue;
+ if (radix_tree_deref_retry(eb)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ if (eb->start < block_group->start)
+ continue;
+ if (eb->start >= end)
+ break;
+
+ slot = radix_tree_iter_resume(slot, &iter);
+ rcu_read_unlock();
+ wait_on_extent_buffer_writeback(eb);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ const bool is_metadata = (block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
+ int ret = 0;
+ int i;
+
+ spin_lock(&block_group->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if (is_metadata &&
+ block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+
+ /*
+ * If we are sure that the block group is full (= no more room left for
+ * new allocation) and the IO for the last usable block is completed, we
+ * don't need to wait for the other IOs. This holds because we ensure
+ * the sequential IO submissions using the ZONE_APPEND command for data
+ * and block_group->meta_write_pointer for metadata.
+ */
+ if (!fully_written) {
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+ /* Wait for extent buffers to be written. */
+ if (is_metadata)
+ wait_eb_writebacks(block_group);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+ }
+
+ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 physical = map->stripes[i].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+
+ if (ret)
+ return ret;
+
+ btrfs_dev_clear_active_zone(device, physical);
+ }
+
+ if (!fully_written)
+ btrfs_dec_block_group_ro(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+
+ clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
+ return 0;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return 0;
+
+ return do_zone_finish(block_group, false);
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+{
+ struct btrfs_fs_info *fs_info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones ||
+ atomic_read(&zinfo->active_zones_left)) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (!ret)
+ set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ u64 min_alloc_bytes;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ /* No MIXED_BG on zoned btrfs. */
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ min_alloc_bytes = fs_info->sectorsize;
+ else
+ min_alloc_bytes = fs_info->nodesize;
+
+ /* Bail out if we can allocate more data from this block group. */
+ if (logical + length + min_alloc_bytes <=
+ block_group->start + block_group->zone_capacity)
+ goto out;
+
+ do_zone_finish(block_group, true);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+{
+ struct btrfs_block_group *bg =
+ container_of(work, struct btrfs_block_group, zone_finish_work);
+
+ wait_on_extent_buffer_writeback(bg->last_eb);
+ free_extent_buffer(bg->last_eb);
+ btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ btrfs_put_block_group(bg);
+}
+
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb)
+{
+ if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ return;
+
+ if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+ btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+ bg->start);
+ return;
+ }
+
+ /* For the work */
+ btrfs_get_block_group(bg);
+ atomic_inc(&eb->refs);
+ bg->last_eb = eb;
+ INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+ queue_work(system_unbound_wq, &bg->zone_finish_work);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
+
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->zone_info) {
+ vfree(device->zone_info->zone_cache);
+ device->zone_info->zone_cache = NULL;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 used = 0;
+ u64 total = 0;
+ u64 factor;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+
+ if (fs_info->bg_reclaim_threshold == 0)
+ return false;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ total += device->disk_total_bytes;
+ used += device->bytes_used;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ factor = div64_u64(used * 100, total);
+ return factor >= fs_info->bg_reclaim_threshold;
+}
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ /* It should be called on a previous data relocation block group. */
+ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+ spin_lock(&block_group->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
+ goto out;
+
+ /* All relocation extents are written. */
+ if (block_group->start + block_group->alloc_offset == logical + length) {
+ /* Now, release this block group for further allocations. */
+ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags);
+ }
+
+out:
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+}
+
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_block_group *min_bg = NULL;
+ u64 min_avail = U64_MAX;
+ int ret;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs,
+ active_bg_list) {
+ u64 avail;
+
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (min_avail > avail) {
+ if (min_bg)
+ btrfs_put_block_group(min_bg);
+ min_bg = block_group;
+ min_avail = avail;
+ btrfs_get_block_group(min_bg);
+ }
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ if (!min_bg)
+ return 0;
+
+ ret = btrfs_zone_finish(min_bg);
+ btrfs_put_block_group(min_bg);
+
+ return ret < 0 ? ret : 1;
+}
+
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ struct btrfs_block_group *bg;
+ int index;
+
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
+ /* No more block groups to activate */
+ if (space_info->active_total_bytes == space_info->total_bytes)
+ return 0;
+
+ for (;;) {
+ int ret;
+ bool need_finish = false;
+
+ down_read(&space_info->groups_sem);
+ for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
+ list_for_each_entry(bg, &space_info->block_groups[index],
+ list) {
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_zoned_bg_is_full(bg) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+
+ if (btrfs_zone_activate(bg)) {
+ up_read(&space_info->groups_sem);
+ return 1;
+ }
+
+ need_finish = true;
+ }
+ }
+ up_read(&space_info->groups_sem);
+
+ if (!do_finish || !need_finish)
+ break;
+
+ ret = btrfs_zone_finish_one_bg(fs_info);
+ if (ret == 0)
+ break;
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
new file mode 100644
index 000000000000..8bd16d40b7c6
--- /dev/null
+++ b/fs/btrfs/zoned.h
@@ -0,0 +1,420 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ZONED_H
+#define BTRFS_ZONED_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include "volumes.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "btrfs_inode.h"
+
+#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
+
+struct btrfs_zoned_device_info {
+ /*
+ * Number of zones, zone size and types of zones if bdev is a
+ * zoned block device.
+ */
+ u64 zone_size;
+ u8 zone_size_shift;
+ u64 max_zone_append_size;
+ u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
+ unsigned long *seq_zones;
+ unsigned long *empty_zones;
+ unsigned long *active_zones;
+ struct blk_zone *zone_cache;
+ struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
+};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone);
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
+int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret);
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes);
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes);
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb);
+void btrfs_free_redirty_list(struct btrfs_transaction *trans);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start);
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio);
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret);
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb);
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos);
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, bool do_finish);
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
+ bool populate_cache)
+{
+ return 0;
+}
+
+static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+
+/*
+ * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
+ * into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
+ */
+static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
+ struct btrfs_device *orig_dev)
+{
+ return NULL;
+}
+
+static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_err(fs_info, "zoned block devices support is not enabled");
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ return 0;
+}
+
+static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
+ int mirror, int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
+ int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
+
+static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ return 0;
+}
+
+static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
+ u64 hole_start, u64 hole_end,
+ u64 num_bytes)
+{
+ return hole_start;
+}
+
+static inline int btrfs_reset_device_zone(struct btrfs_device *device,
+ u64 physical, u64 length, u64 *bytes)
+{
+ *bytes = 0;
+ return 0;
+}
+
+static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
+ u64 start, u64 size)
+{
+ return 0;
+}
+
+static inline int btrfs_load_block_group_zone_info(
+ struct btrfs_block_group *cache, bool new)
+{
+ return 0;
+}
+
+static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
+
+static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb) { }
+static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
+
+static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
+{
+ return false;
+}
+
+static inline void btrfs_record_physical_zoned(struct inode *inode,
+ u64 file_offset, struct bio *bio)
+{
+}
+
+static inline void btrfs_rewrite_logical_zoned(
+ struct btrfs_ordered_extent *ordered) { }
+
+static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret)
+{
+ return true;
+}
+
+static inline void btrfs_revert_meta_write_pointer(
+ struct btrfs_block_group *cache,
+ struct extent_buffer *eb)
+{
+}
+
+static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
+ u64 logical, u64 physical_start,
+ u64 physical_pos)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct btrfs_device *btrfs_zoned_get_device(
+ struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ u64 flags)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
+static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ return 1;
+}
+
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ /* Consider all the block groups are active */
+ return 0;
+}
+
+#endif
+
+static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return false;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones);
+}
+
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return true;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device,
+ u64 pos, bool set)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno;
+
+ if (!zone_info)
+ return;
+
+ zno = pos >> zone_info->zone_size_shift;
+ if (set)
+ set_bit(zno, zone_info->empty_zones);
+ else
+ clear_bit(zno, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, true);
+}
+
+static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, false);
+}
+
+static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
+ struct block_device *bdev)
+{
+ if (btrfs_is_zoned(fs_info)) {
+ /*
+ * We can allow a regular device on a zoned filesystem, because
+ * we will emulate the zoned capabilities.
+ */
+ if (!bdev_is_zoned(bdev))
+ return true;
+
+ return fs_info->zone_size ==
+ (bdev_zone_sectors(bdev) << SECTOR_SHIFT);
+ }
+
+ /* Do not allow Host Manged zoned device */
+ return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+}
+
+static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
+{
+ /*
+ * On a non-zoned device, any address is OK. On a zoned device,
+ * non-SEQUENTIAL WRITE REQUIRED zones are capable.
+ */
+ return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos);
+}
+
+static inline bool btrfs_can_zone_reset(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ u64 zone_size;
+
+ if (!btrfs_dev_is_sequential(device, physical))
+ return false;
+
+ zone_size = device->zone_info->zone_size;
+ if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size))
+ return false;
+
+ return true;
+}
+
+static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_lock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_unlock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg == bg->start)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+}
+
+static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
+static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
+static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
+{
+ ASSERT(btrfs_is_zoned(bg->fs_info));
+ return (bg->alloc_offset == bg->zone_capacity);
+}
+
+#endif
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 9a4871636c6c..35a0224d4eb7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -28,10 +28,10 @@
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
size_t src_len)
{
- ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
+ zstd_parameters params = zstd_get_params(level, src_len);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -48,8 +48,8 @@ struct workspace {
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
- ZSTD_inBuffer in_buf;
- ZSTD_outBuffer out_buf;
+ zstd_in_buffer in_buf;
+ zstd_out_buffer out_buf;
};
/*
@@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
-/*
- * zstd_reclaim_timer_fn - reclaim timer
+
+/**
+ * Timer callback to free unused workspaces.
+ *
* @t: timer
*
* This scans the lru_list and attempts to reclaim any workspace that hasn't
* been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ *
+ * The context is softirq and does not need the _bh locking primitives.
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- spin_lock_bh(&wsm.lock);
+ spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
return;
}
@@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
}
/*
@@ -155,12 +159,12 @@ static void zstd_calc_ws_mem_sizes(void)
unsigned int level;
for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
- ZSTD_parameters params =
+ zstd_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
max_t(size_t,
- ZSTD_CStreamWorkspaceBound(params.cParams),
- ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+ zstd_cstream_workspace_bound(&params.cParams),
+ zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
zstd_ws_mem_sizes[level - 1] = max_size;
@@ -371,7 +375,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- ZSTD_CStream *stream;
+ zstd_cstream *stream;
int ret = 0;
int nr_pages = 0;
struct page *in_page = NULL; /* The current page to read */
@@ -381,7 +385,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
*out_pages = 0;
@@ -389,40 +393,40 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = 0;
/* Initialize the stream */
- stream = ZSTD_initCStream(params, len, workspace->mem,
+ stream = zstd_init_cstream(&params, len, workspace->mem,
workspace->size);
if (!stream) {
- pr_warn("BTRFS: ZSTD_initCStream failed\n");
+ pr_warn("BTRFS: zstd_init_cstream failed\n");
ret = -EIO;
goto out;
}
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
while (1) {
size_t ret2;
- ret2 = ZSTD_compressStream(stream, &workspace->out_buf,
+ ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_compress_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto out;
}
@@ -446,19 +450,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -473,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
- kunmap(in_page);
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
-
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -487,10 +488,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
while (1) {
size_t ret2;
- ret2 = ZSTD_endStream(stream, &workspace->out_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_endStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ ret2 = zstd_end_stream(stream, &workspace->out_buf);
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_end_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto out;
}
@@ -506,19 +507,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -533,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = tot_out;
out:
*out_pages = nr_pages;
- /* Cleanup */
- if (in_page) {
- kunmap(in_page);
+ if (workspace->in_buf.src) {
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
}
- if (out_page)
- kunmap(out_page);
return ret;
}
@@ -547,25 +543,23 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
size_t srclen = cb->compressed_len;
- ZSTD_DStream *stream;
+ zstd_dstream *stream;
int ret = 0;
unsigned long page_in_index = 0;
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long total_out = 0;
- stream = ZSTD_initDStream(
+ stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (!stream) {
- pr_debug("BTRFS: ZSTD_initDStream failed\n");
+ pr_debug("BTRFS: zstd_init_dstream failed\n");
ret = -EIO;
goto done;
}
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -576,11 +570,11 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
while (1) {
size_t ret2;
- ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto done;
}
@@ -589,7 +583,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->out_buf.pos = 0;
ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
- buf_start, total_out, disk_start, orig_bio);
+ total_out - buf_start, cb, buf_start);
if (ret == 0)
break;
@@ -601,23 +595,24 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- kunmap(pages_in[page_in_index++]);
+ kunmap_local(workspace->in_buf.src);
+ page_in_index++;
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
}
ret = 0;
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
done:
if (workspace->in_buf.src)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(workspace->in_buf.src);
return ret;
}
@@ -626,17 +621,16 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- ZSTD_DStream *stream;
+ zstd_dstream *stream;
int ret = 0;
size_t ret2;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- char *kaddr;
- stream = ZSTD_initDStream(
+ stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (!stream) {
- pr_warn("BTRFS: ZSTD_initDStream failed\n");
+ pr_warn("BTRFS: zstd_init_dstream failed\n");
ret = -EIO;
goto finish;
}
@@ -660,15 +654,15 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
/* Check if the frame is over and we still need more input */
if (ret2 == 0) {
- pr_debug("BTRFS: ZSTD_decompressStream ended early\n");
+ pr_debug("BTRFS: zstd_decompress_stream ended early\n");
ret = -EIO;
goto finish;
}
- ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+ ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (ZSTD_isError(ret2)) {
- pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
- ZSTD_getErrorCode(ret2));
+ if (zstd_is_error(ret2)) {
+ pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
+ zstd_get_error_code(ret2));
ret = -EIO;
goto finish;
}
@@ -688,19 +682,15 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
bytes = min_t(unsigned long, destlen - pg_offset,
workspace->out_buf.size - buf_offset);
- kaddr = kmap_atomic(dest_page);
- memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset,
- bytes);
- kunmap_atomic(kaddr);
+ memcpy_to_page(dest_page, pg_offset,
+ workspace->out_buf.dst + buf_offset, bytes);
pg_offset += bytes;
}
ret = 0;
finish:
if (pg_offset < destlen) {
- kaddr = kmap_atomic(dest_page);
- memset(kaddr + pg_offset, 0, destlen - pg_offset);
- kunmap_atomic(kaddr);
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
}
return ret;
}