aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig.binfmt15
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/cmservice.c25
-rw-r--r--fs/afs/dir.c10
-rw-r--r--fs/afs/write.c18
-rw-r--r--fs/binfmt_em86.c110
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/backref.c6
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/delayed-ref.c4
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c3
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/ordered-data.h3
-rw-r--r--fs/btrfs/qgroup.c38
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c20
-rw-r--r--fs/btrfs/tree-log.c35
-rw-r--r--fs/btrfs/volumes.c1
-rw-r--r--fs/btrfs/zoned.c12
-rw-r--r--fs/ceph/mds_client.c2
-rw-r--r--fs/cifs/cifssmb.c10
-rw-r--r--fs/cifs/connect.c4
-rw-r--r--fs/cifs/dfs_cache.c229
-rw-r--r--fs/cifs/dfs_cache.h3
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/fs_context.c14
-rw-r--r--fs/cifs/smb2ops.c48
-rw-r--r--fs/ext2/dir.c12
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/namei.c4
-rw-r--r--fs/fs-writeback.c3
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/internal.h1
-rw-r--r--fs/io-wq.c7
-rw-r--r--fs/io_uring.c93
-rw-r--r--fs/ocfs2/file.c103
-rw-r--r--fs/pipe.c29
-rw-r--r--fs/reiserfs/stree.c31
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/seq_file.c3
-rw-r--r--fs/userfaultfd.c26
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h11
-rw-r--r--fs/xfs/xfs_buf_item_recover.c15
-rw-r--r--fs/xfs/xfs_inode_item_recover.c39
-rw-r--r--fs/xfs/xfs_log.c251
-rw-r--r--fs/xfs/xfs_log_cil.c13
-rw-r--r--fs/xfs/xfs_log_priv.h16
-rw-r--r--fs/xfs/xfs_trace.h5
51 files changed, 834 insertions, 469 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 06fb7a93a1bd..4d5ae61580aa 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -168,21 +168,6 @@ config OSF4_COMPAT
with v4 shared libraries freely available from Compaq. If you're
going to use shared libraries from Tru64 version 5.0 or later, say N.
-config BINFMT_EM86
- tristate "Kernel support for Linux/Intel ELF binaries"
- depends on ALPHA
- help
- Say Y here if you want to be able to execute Linux/Intel ELF
- binaries just like native Alpha binaries on your Alpha machine. For
- this to work, you need to have the emulator /usr/bin/em86 in place.
-
- You can get the same functionality by saying N here and saying Y to
- "Kernel support for MISC binaries".
-
- You may answer M to compile the emulation support as a module and
- later load the module when you want to use a Linux/Intel binary. The
- module will be called binfmt_em86. If unsure, say Y.
-
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
help
diff --git a/fs/Makefile b/fs/Makefile
index 9c708e1fbe8f..f98f3e691c37 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,7 +39,6 @@ obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
-obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d3c6bb22c5f4..a3f5de28be79 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -29,16 +29,11 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
static int afs_deliver_yfs_cb_callback(struct afs_call *);
-#define CM_NAME(name) \
- char afs_SRXCB##name##_name[] __tracepoint_string = \
- "CB." #name
-
/*
* CB.CallBack operation type
*/
-static CM_NAME(CallBack);
static const struct afs_call_type afs_SRXCBCallBack = {
- .name = afs_SRXCBCallBack_name,
+ .name = "CB.CallBack",
.deliver = afs_deliver_cb_callback,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_CallBack,
@@ -47,9 +42,8 @@ static const struct afs_call_type afs_SRXCBCallBack = {
/*
* CB.InitCallBackState operation type
*/
-static CM_NAME(InitCallBackState);
static const struct afs_call_type afs_SRXCBInitCallBackState = {
- .name = afs_SRXCBInitCallBackState_name,
+ .name = "CB.InitCallBackState",
.deliver = afs_deliver_cb_init_call_back_state,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_InitCallBackState,
@@ -58,9 +52,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState = {
/*
* CB.InitCallBackState3 operation type
*/
-static CM_NAME(InitCallBackState3);
static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
- .name = afs_SRXCBInitCallBackState3_name,
+ .name = "CB.InitCallBackState3",
.deliver = afs_deliver_cb_init_call_back_state3,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_InitCallBackState,
@@ -69,9 +62,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
/*
* CB.Probe operation type
*/
-static CM_NAME(Probe);
static const struct afs_call_type afs_SRXCBProbe = {
- .name = afs_SRXCBProbe_name,
+ .name = "CB.Probe",
.deliver = afs_deliver_cb_probe,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_Probe,
@@ -80,9 +72,8 @@ static const struct afs_call_type afs_SRXCBProbe = {
/*
* CB.ProbeUuid operation type
*/
-static CM_NAME(ProbeUuid);
static const struct afs_call_type afs_SRXCBProbeUuid = {
- .name = afs_SRXCBProbeUuid_name,
+ .name = "CB.ProbeUuid",
.deliver = afs_deliver_cb_probe_uuid,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_ProbeUuid,
@@ -91,9 +82,8 @@ static const struct afs_call_type afs_SRXCBProbeUuid = {
/*
* CB.TellMeAboutYourself operation type
*/
-static CM_NAME(TellMeAboutYourself);
static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
- .name = afs_SRXCBTellMeAboutYourself_name,
+ .name = "CB.TellMeAboutYourself",
.deliver = afs_deliver_cb_tell_me_about_yourself,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_TellMeAboutYourself,
@@ -102,9 +92,8 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
/*
* YFS CB.CallBack operation type
*/
-static CM_NAME(YFS_CallBack);
static const struct afs_call_type afs_SRXYFSCB_CallBack = {
- .name = afs_SRXCBYFS_CallBack_name,
+ .name = "YFSCB.CallBack",
.deliver = afs_deliver_yfs_cb_callback,
.destructor = afs_cm_destructor,
.work = SRXAFSCB_CallBack,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 78719f2f567e..ac829e63c570 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -656,7 +656,6 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
return ret;
}
- ret = -ENOENT;
if (!cookie.found) {
_leave(" = -ENOENT [not found]");
return -ENOENT;
@@ -2020,17 +2019,20 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (d_count(new_dentry) > 2) {
/* copy the target dentry's name */
- ret = -ENOMEM;
op->rename.tmp = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
- if (!op->rename.tmp)
+ if (!op->rename.tmp) {
+ op->error = -ENOMEM;
goto error;
+ }
ret = afs_sillyrename(new_dvnode,
AFS_FS_I(d_inode(new_dentry)),
new_dentry, op->key);
- if (ret)
+ if (ret) {
+ op->error = ret;
goto error;
+ }
op->dentry_2 = op->rename.tmp;
op->rename.rehash = NULL;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3104b62c2082..c0534697268e 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -771,14 +771,20 @@ int afs_writepages(struct address_space *mapping,
if (wbc->range_cyclic) {
start = mapping->writeback_index * PAGE_SIZE;
ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
- if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
- ret = afs_writepages_region(mapping, wbc, 0, start,
- &next);
- mapping->writeback_index = next / PAGE_SIZE;
+ if (ret == 0) {
+ mapping->writeback_index = next / PAGE_SIZE;
+ if (start > 0 && wbc->nr_to_write > 0) {
+ ret = afs_writepages_region(mapping, wbc, 0,
+ start, &next);
+ if (ret == 0)
+ mapping->writeback_index =
+ next / PAGE_SIZE;
+ }
+ }
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
- if (wbc->nr_to_write > 0)
- mapping->writeback_index = next;
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = next / PAGE_SIZE;
} else {
ret = afs_writepages_region(mapping, wbc,
wbc->range_start, wbc->range_end, &next);
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
deleted file mode 100644
index 06b9b9fddf70..000000000000
--- a/fs/binfmt_em86.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/binfmt_em86.c
- *
- * Based on linux/fs/binfmt_script.c
- * Copyright (C) 1996 Martin von Löwis
- * original #!-checking implemented by tytso.
- *
- * em86 changes Copyright (C) 1997 Jim Paradis
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/binfmts.h>
-#include <linux/elf.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/errno.h>
-
-
-#define EM86_INTERP "/usr/bin/em86"
-#define EM86_I_NAME "em86"
-
-static int load_em86(struct linux_binprm *bprm)
-{
- const char *i_name, *i_arg;
- char *interp;
- struct file * file;
- int retval;
- struct elfhdr elf_ex;
-
- /* Make sure this is a Linux/Intel ELF executable... */
- elf_ex = *((struct elfhdr *)bprm->buf);
-
- if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
- return -ENOEXEC;
-
- /* First of all, some simple consistency checks */
- if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) ||
- (!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) ||
- !bprm->file->f_op->mmap) {
- return -ENOEXEC;
- }
-
- /* Need to be able to load the file after exec */
- if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
- return -ENOENT;
-
- /* Unlike in the script case, we don't have to do any hairy
- * parsing to find our interpreter... it's hardcoded!
- */
- interp = EM86_INTERP;
- i_name = EM86_I_NAME;
- i_arg = NULL; /* We reserve the right to add an arg later */
-
- /*
- * Splice in (1) the interpreter's name for argv[0]
- * (2) (optional) argument to interpreter
- * (3) filename of emulated file (replace argv[0])
- *
- * This is done in reverse order, because of how the
- * user environment and arguments are stored.
- */
- remove_arg_zero(bprm);
- retval = copy_string_kernel(bprm->filename, bprm);
- if (retval < 0) return retval;
- bprm->argc++;
- if (i_arg) {
- retval = copy_string_kernel(i_arg, bprm);
- if (retval < 0) return retval;
- bprm->argc++;
- }
- retval = copy_string_kernel(i_name, bprm);
- if (retval < 0) return retval;
- bprm->argc++;
-
- /*
- * OK, now restart the process with the interpreter's inode.
- * Note that we use open_exec() as the name is now in kernel
- * space, and we don't need to copy it.
- */
- file = open_exec(interp);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- bprm->interpreter = file;
- return 0;
-}
-
-static struct linux_binfmt em86_format = {
- .module = THIS_MODULE,
- .load_binary = load_em86,
-};
-
-static int __init init_em86_binfmt(void)
-{
- register_binfmt(&em86_format);
- return 0;
-}
-
-static void __exit exit_em86_binfmt(void)
-{
- unregister_binfmt(&em86_format);
-}
-
-core_initcall(init_em86_binfmt);
-module_exit(exit_em86_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0c424a0cadaa..9ef4f1fc2cb0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -812,6 +812,8 @@ static void bdev_free_inode(struct inode *inode)
free_percpu(bdev->bd_stats);
kfree(bdev->bd_meta_info);
+ if (!bdev_is_partition(bdev))
+ kfree(bdev->bd_disk);
kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7a8a2fc19533..78b202d198b8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1488,15 +1488,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
- bool ignore_offset)
+ bool ignore_offset, bool skip_commit_root_sem)
{
int ret;
- if (!trans)
+ if (!trans && !skip_commit_root_sem)
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
time_seq, roots, ignore_offset);
- if (!trans)
+ if (!trans && !skip_commit_root_sem)
up_read(&fs_info->commit_root_sem);
return ret;
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 17abde7f794c..ff5f07f9940b 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -47,7 +47,8 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots, bool ignore_offset);
+ u64 time_seq, struct ulist **roots, bool ignore_offset,
+ bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9a023ae0f98b..30d82cdf128c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -352,7 +352,7 @@ static void end_compressed_bio_write(struct bio *bio)
btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- bio->bi_status == BLK_STS_OK);
+ !cb->errors);
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 06bc842ecdb3..ca848b183474 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -974,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(fs_info, record);
+ btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
@@ -1069,7 +1069,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b117dd3b8172..a59ab7b9aea0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -209,7 +209,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+ const int num_pages = num_extent_pages(buf);
const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d296483d148f..268ce58d4569 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6019,6 +6019,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
mutex_lock(&fs_info->fs_devices->device_list_mutex);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8f60314c36c5..0117d867ecf8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2992,7 +2992,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->disk)
+ if (ordered_extent->bdev)
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_free_io_failure_record(inode, start, end);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6eb41b7c0c84..5c0f8481e25e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -190,8 +190,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
- entry->disk = NULL;
- entry->partno = (u8)-1;
ASSERT(type == BTRFS_ORDERED_REGULAR ||
type == BTRFS_ORDERED_NOCOW ||
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 566472004edd..b2d88aba8420 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -145,8 +145,7 @@ struct btrfs_ordered_extent {
* command in a workqueue context
*/
u64 physical;
- struct gendisk *disk;
- u8 partno;
+ struct block_device *bdev;
};
/*
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 07ec06d4e972..0fa121171ca1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1704,17 +1704,39 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct ulist *old_root;
u64 bytenr = qrecord->bytenr;
int ret;
- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
+ /*
+ * We are always called in a context where we are already holding a
+ * transaction handle. Often we are called when adding a data delayed
+ * reference from btrfs_truncate_inode_items() (truncating or unlinking),
+ * in which case we will be holding a write lock on extent buffer from a
+ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
+ * acquire fs_info->commit_root_sem, because that is a higher level lock
+ * that must be acquired before locking any extent buffers.
+ *
+ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
+ * but we can't pass it a non-NULL transaction handle, because otherwise
+ * it would not use commit roots and would lock extent buffers, causing
+ * a deadlock if it ends up trying to read lock the same extent buffer
+ * that was previously write locked at btrfs_truncate_inode_items().
+ *
+ * So pass a NULL transaction handle to btrfs_find_all_roots() and
+ * explicitly tell it to not acquire the commit_root_sem - if we are
+ * holding a transaction handle we don't need its protection.
+ */
+ ASSERT(trans != NULL);
+
+ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
+ false, true);
if (ret < 0) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_warn(fs_info,
+ trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -1758,7 +1780,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
}
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -2629,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
- &record->old_roots, false);
+ &record->old_roots, false, false);
if (ret < 0)
goto cleanup;
}
@@ -2645,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -3179,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
num_bytes = found.offset;
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
- &roots, false);
+ &roots, false, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7283e4f549af..880e9df0dac1 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -298,7 +298,7 @@ int btrfs_qgroup_trace_extent_nolock(
* using current root, then we can move all expensive backref walk out of
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
/*
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index f3137285a9e2..98b5aaba46f1 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -224,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
* quota.
*/
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -237,7 +237,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -261,7 +261,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -273,7 +273,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return -EINVAL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -325,7 +325,7 @@ static int test_multiple_refs(struct btrfs_root *root,
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -338,7 +338,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -360,7 +360,7 @@ static int test_multiple_refs(struct btrfs_root *root,
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -373,7 +373,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -401,7 +401,7 @@ static int test_multiple_refs(struct btrfs_root *root,
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -414,7 +414,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ false, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index dc6eb088d73e..e6430ac9bbe8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5526,16 +5526,29 @@ log_extents:
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
/*
- * Don't update last_log_commit if we logged that an inode exists
- * after it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode,
- * then the inode gets evicted after all delalloc was flushed,
- * then we log it exists (due to a rename for example) and then
- * fsync it. This last fsync would do nothing (not logging the
- * extents previously written).
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for two reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
*/
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
}
@@ -6490,8 +6503,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans < trans->transid &&
- (!old_dir || old_dir->logged_trans < trans->transid))
+ if (!inode_logged(trans, inode) &&
+ (!old_dir || !inode_logged(trans, old_dir)))
return;
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1e4d43ffe38b..70f94b75f25a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1078,6 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 297c0b1c0634..907c2cc45c9c 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1349,8 +1349,7 @@ void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
return;
ordered->physical = physical;
- ordered->disk = bio->bi_bdev->bd_disk;
- ordered->partno = bio->bi_bdev->bd_partno;
+ ordered->bdev = bio->bi_bdev;
btrfs_put_ordered_extent(ordered);
}
@@ -1362,18 +1361,16 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_ordered_sum *sum;
- struct block_device *bdev;
u64 orig_logical = ordered->disk_bytenr;
u64 *logical = NULL;
int nr, stripe_len;
/* Zoned devices should not have partitions. So, we can assume it is 0 */
- ASSERT(ordered->partno == 0);
- bdev = bdgrab(ordered->disk->part0);
- if (WARN_ON(!bdev))
+ ASSERT(!bdev_is_partition(ordered->bdev));
+ if (WARN_ON(!ordered->bdev))
return;
- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
+ if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
ordered->physical, &logical, &nr,
&stripe_len)))
goto out;
@@ -1402,7 +1399,6 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
out:
kfree(logical);
- bdput(bdev);
}
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a818213c972f..9db1b39df773 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4456,7 +4456,7 @@ bool check_session_state(struct ceph_mds_session *s)
break;
case CEPH_MDS_SESSION_CLOSING:
/* Should never reach this when we're unmounting */
- WARN_ON_ONCE(true);
+ WARN_ON_ONCE(s->s_ttl);
fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f72e3b3dca69..65d1a65bfc37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -873,8 +873,11 @@ PsxDelete:
InformationLevel) - 4;
offset = param_offset + params;
- /* Setup pointer to Request Data (inode type) */
- pRqD = (struct unlink_psx_rq *)(((char *)&pSMB->hdr.Protocol) + offset);
+ /* Setup pointer to Request Data (inode type).
+ * Note that SMB offsets are from the beginning of SMB which is 4 bytes
+ * in, after RFC1001 field
+ */
+ pRqD = (struct unlink_psx_rq *)((char *)(pSMB) + offset + 4);
pRqD->type = cpu_to_le16(type);
pSMB->ParameterOffset = cpu_to_le16(param_offset);
pSMB->DataOffset = cpu_to_le16(offset);
@@ -1081,7 +1084,8 @@ PsxCreat:
param_offset = offsetof(struct smb_com_transaction2_spi_req,
InformationLevel) - 4;
offset = param_offset + params;
- pdata = (OPEN_PSX_REQ *)(((char *)&pSMB->hdr.Protocol) + offset);
+ /* SMB offsets are from the beginning of SMB which is 4 bytes in, after RFC1001 field */
+ pdata = (OPEN_PSX_REQ *)((char *)(pSMB) + offset + 4);
pdata->Level = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
pdata->Permissions = cpu_to_le64(mode);
pdata->PosixOpenFlags = cpu_to_le32(posix_flags);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1b04d6ec14dd..3781eee9360a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -220,7 +220,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DFS_UPCALL
struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
- struct dfs_cache_tgt_list tgt_list = {0};
+ struct dfs_cache_tgt_list tgt_list = DFS_CACHE_TGT_LIST_INIT(tgt_list);
struct dfs_cache_tgt_iterator *tgt_it = NULL;
#endif
@@ -3130,7 +3130,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
{
int rc;
char *npath = NULL;
- struct dfs_cache_tgt_list tgt_list = {0};
+ struct dfs_cache_tgt_list tgt_list = DFS_CACHE_TGT_LIST_INIT(tgt_list);
struct dfs_cache_tgt_iterator *tgt_it = NULL;
struct smb3_fs_context tmp_ctx = {NULL};
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 7c1769714609..283745592844 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -19,6 +19,7 @@
#include "cifs_debug.h"
#include "cifs_unicode.h"
#include "smb2glob.h"
+#include "dns_resolve.h"
#include "dfs_cache.h"
@@ -911,6 +912,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl)
err_free_it:
list_for_each_entry_safe(it, nit, head, it_list) {
+ list_del(&it->it_list);
kfree(it->it_name);
kfree(it);
}
@@ -1293,6 +1295,194 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
return 0;
}
+static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, const char *s2)
+{
+ char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0};
+ const char *host;
+ size_t hostlen;
+ char *ip = NULL;
+ struct sockaddr sa;
+ bool match;
+ int rc;
+
+ if (strcasecmp(s1, s2))
+ return false;
+
+ /*
+ * Resolve share's hostname and check if server address matches. Otherwise just ignore it
+ * as we could not have upcall to resolve hostname or failed to convert ip address.
+ */
+ match = true;
+ extract_unc_hostname(s1, &host, &hostlen);
+ scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host);
+
+ rc = dns_resolve_server_name_to_ip(unc, &ip, NULL);
+ if (rc < 0) {
+ cifs_dbg(FYI, "%s: could not resolve %.*s. assuming server address matches.\n",
+ __func__, (int)hostlen, host);
+ return true;
+ }
+
+ if (!cifs_convert_address(&sa, ip, strlen(ip))) {
+ cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n",
+ __func__, ip);
+ } else {
+ mutex_lock(&server->srv_mutex);
+ match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa);
+ mutex_unlock(&server->srv_mutex);
+ }
+
+ kfree(ip);
+ return match;
+}
+
+/*
+ * Mark dfs tcon for reconnecting when the currently connected tcon does not match any of the new
+ * target shares in @refs.
+ */
+static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cache_tgt_list *tl,
+ const struct dfs_info3_param *refs, int numrefs)
+{
+ struct dfs_cache_tgt_iterator *it;
+ int i;
+
+ for (it = dfs_cache_get_tgt_iterator(tl); it; it = dfs_cache_get_next_tgt(tl, it)) {
+ for (i = 0; i < numrefs; i++) {
+ if (target_share_equal(tcon->ses->server, dfs_cache_get_tgt_name(it),
+ refs[i].node_name))
+ return;
+ }
+ }
+
+ cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
+ for (i = 0; i < tcon->ses->chan_count; i++) {
+ spin_lock(&GlobalMid_Lock);
+ if (tcon->ses->chans[i].server->tcpStatus != CifsExiting)
+ tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&GlobalMid_Lock);
+ }
+}
+
+/* Refresh dfs referral of tcon and mark it for reconnect if needed */
+static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool force_refresh)
+{
+ const char *path = tcon->dfs_path + 1;
+ struct cifs_ses *ses;
+ struct cache_entry *ce;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+ bool needs_refresh = false;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ int rc = 0;
+ unsigned int xid;
+
+ ses = find_ipc_from_server_path(sessions, path);
+ if (IS_ERR(ses)) {
+ cifs_dbg(FYI, "%s: could not find ipc session\n", __func__);
+ return PTR_ERR(ses);
+ }
+
+ down_read(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
+ if (!IS_ERR(ce)) {
+ rc = get_targets(ce, &tl);
+ if (rc)
+ cifs_dbg(FYI, "%s: could not get dfs targets: %d\n", __func__, rc);
+ }
+ up_read(&htable_rw_lock);
+
+ if (!needs_refresh) {
+ rc = 0;
+ goto out;
+ }
+
+ xid = get_xid();
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
+ free_xid(xid);
+
+ /* Create or update a cache entry with the new referral */
+ if (!rc) {
+ dump_refs(refs, numrefs);
+
+ down_write(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ if (IS_ERR(ce))
+ add_cache_entry_locked(refs, numrefs);
+ else if (force_refresh || cache_entry_expired(ce))
+ update_cache_entry_locked(ce, refs, numrefs);
+ up_write(&htable_rw_lock);
+
+ mark_for_reconnect_if_needed(tcon, &tl, refs, numrefs);
+ }
+
+out:
+ dfs_cache_free_tgts(&tl);
+ free_dfs_info_array(refs, numrefs);
+ return rc;
+}
+
+/**
+ * dfs_cache_remount_fs - remount a DFS share
+ *
+ * Reconfigure dfs mount by forcing a new DFS referral and if the currently cached targets do not
+ * match any of the new targets, mark it for reconnect.
+ *
+ * @cifs_sb: cifs superblock.
+ *
+ * Return zero if remounted, otherwise non-zero.
+ */
+int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
+{
+ struct cifs_tcon *tcon;
+ struct mount_group *mg;
+ struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
+ int rc;
+
+ if (!cifs_sb || !cifs_sb->master_tlink)
+ return -EINVAL;
+
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (!tcon->dfs_path) {
+ cifs_dbg(FYI, "%s: not a dfs tcon\n", __func__);
+ return 0;
+ }
+
+ if (uuid_is_null(&cifs_sb->dfs_mount_id)) {
+ cifs_dbg(FYI, "%s: tcon has no dfs mount group id\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&mount_group_list_lock);
+ mg = find_mount_group_locked(&cifs_sb->dfs_mount_id);
+ if (IS_ERR(mg)) {
+ mutex_unlock(&mount_group_list_lock);
+ cifs_dbg(FYI, "%s: tcon has ipc session to refresh referral\n", __func__);
+ return PTR_ERR(mg);
+ }
+ kref_get(&mg->refcount);
+ mutex_unlock(&mount_group_list_lock);
+
+ spin_lock(&mg->lock);
+ memcpy(&sessions, mg->sessions, mg->num_sessions * sizeof(mg->sessions[0]));
+ spin_unlock(&mg->lock);
+
+ /*
+ * After reconnecting to a different server, unique ids won't match anymore, so we disable
+ * serverino. This prevents dentry revalidation to think the dentry are stale (ESTALE).
+ */
+ cifs_autodisable_serverino(cifs_sb);
+ /*
+ * Force the use of prefix path to support failover on DFS paths that resolve to targets
+ * that have different prefix paths.
+ */
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = refresh_tcon(sessions, tcon, true);
+
+ kref_put(&mg->refcount, mount_group_release);
+ return rc;
+}
+
/*
* Refresh all active dfs mounts regardless of whether they are in cache or not.
* (cache can be cleared)
@@ -1303,7 +1493,6 @@ static void refresh_mounts(struct cifs_ses **sessions)
struct cifs_ses *ses;
struct cifs_tcon *tcon, *ntcon;
struct list_head tcons;
- unsigned int xid;
INIT_LIST_HEAD(&tcons);
@@ -1321,44 +1510,8 @@ static void refresh_mounts(struct cifs_ses **sessions)
spin_unlock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
- const char *path = tcon->dfs_path + 1;
- struct cache_entry *ce;
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
- bool needs_refresh = false;
- int rc = 0;
-
list_del_init(&tcon->ulist);
-
- ses = find_ipc_from_server_path(sessions, path);
- if (IS_ERR(ses))
- goto next_tcon;
-
- down_read(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- needs_refresh = IS_ERR(ce) || cache_entry_expired(ce);
- up_read(&htable_rw_lock);
-
- if (!needs_refresh)
- goto next_tcon;
-
- xid = get_xid();
- rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- free_xid(xid);
-
- /* Create or update a cache entry with the new referral */
- if (!rc) {
- down_write(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- if (IS_ERR(ce))
- add_cache_entry_locked(refs, numrefs);
- else if (cache_entry_expired(ce))
- update_cache_entry_locked(ce, refs, numrefs);
- up_write(&htable_rw_lock);
- }
-
-next_tcon:
- free_dfs_info_array(refs, numrefs);
+ refresh_tcon(sessions, tcon, false);
cifs_put_tcon(tcon);
}
}
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index b29d3ae64829..52070d1df189 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -13,6 +13,8 @@
#include <linux/uuid.h>
#include "cifsglob.h"
+#define DFS_CACHE_TGT_LIST_INIT(var) { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), }
+
struct dfs_cache_tgt_list {
int tl_numtgts;
struct list_head tl_list;
@@ -44,6 +46,7 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id);
void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses);
char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap);
+int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb);
static inline struct dfs_cache_tgt_iterator *
dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cd108607a070..0a72840a88f1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4619,7 +4619,7 @@ read_complete:
static int cifs_readpage(struct file *file, struct page *page)
{
- loff_t offset = (loff_t)page->index << PAGE_SHIFT;
+ loff_t offset = page_file_offset(page);
int rc = -EACCES;
unsigned int xid;
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 553adfbcc22a..eed59bc1d913 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -13,6 +13,9 @@
#include <linux/magic.h>
#include <linux/security.h>
#include <net/net_namespace.h>
+#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs_cache.h"
+#endif
*/
#include <linux/ctype.h>
@@ -779,6 +782,10 @@ static int smb3_reconfigure(struct fs_context *fc)
smb3_cleanup_fs_context_contents(cifs_sb->ctx);
rc = smb3_fs_context_dup(cifs_sb->ctx, ctx);
smb3_update_mnt_flags(cifs_sb);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (!rc)
+ rc = dfs_cache_remount_fs(cifs_sb);
+#endif
return rc;
}
@@ -918,6 +925,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->cred_uid = uid;
ctx->cruid_specified = true;
break;
+ case Opt_backupuid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto cifs_parse_mount_err;
+ ctx->backupuid = uid;
+ ctx->backupuid_specified = true;
+ break;
case Opt_backupgid:
gid = make_kgid(current_user_ns(), result.uint_32);
if (!gid_valid(gid))
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index ba3c58e1f725..2dfd0d8297eb 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -3618,6 +3618,7 @@ static int smb3_simple_fallocate_write_range(unsigned int xid,
{
struct cifs_io_parms io_parms = {0};
int nbytes;
+ int rc = 0;
struct kvec iov[2];
io_parms.netfid = cfile->fid.netfid;
@@ -3625,13 +3626,25 @@ static int smb3_simple_fallocate_write_range(unsigned int xid,
io_parms.tcon = tcon;
io_parms.persistent_fid = cfile->fid.persistent_fid;
io_parms.volatile_fid = cfile->fid.volatile_fid;
- io_parms.offset = off;
- io_parms.length = len;
- /* iov[0] is reserved for smb header */
- iov[1].iov_base = buf;
- iov[1].iov_len = io_parms.length;
- return SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+ while (len) {
+ io_parms.offset = off;
+ io_parms.length = len;
+ if (io_parms.length > SMB2_MAX_BUFFER_SIZE)
+ io_parms.length = SMB2_MAX_BUFFER_SIZE;
+ /* iov[0] is reserved for smb header */
+ iov[1].iov_base = buf;
+ iov[1].iov_len = io_parms.length;
+ rc = SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+ if (rc)
+ break;
+ if (nbytes > len)
+ return -EINVAL;
+ buf += nbytes;
+ off += nbytes;
+ len -= nbytes;
+ }
+ return rc;
}
static int smb3_simple_fallocate_range(unsigned int xid,
@@ -3655,11 +3668,6 @@ static int smb3_simple_fallocate_range(unsigned int xid,
(char **)&out_data, &out_data_len);
if (rc)
goto out;
- /*
- * It is already all allocated
- */
- if (out_data_len == 0)
- goto out;
buf = kzalloc(1024 * 1024, GFP_KERNEL);
if (buf == NULL) {
@@ -3782,6 +3790,24 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
goto out;
}
+ if (keep_size == true) {
+ /*
+ * We can not preallocate pages beyond the end of the file
+ * in SMB2
+ */
+ if (off >= i_size_read(inode)) {
+ rc = 0;
+ goto out;
+ }
+ /*
+ * For fallocates that are partially beyond the end of file,
+ * clamp len so we only fallocate up to the end of file.
+ */
+ if (off + len > i_size_read(inode)) {
+ len = i_size_read(inode) - off;
+ }
+ }
+
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
/*
* At this point, we are trying to fallocate an internal
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 14292dba3a12..2c2f179b6977 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -106,12 +106,11 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
return err;
}
-static bool ext2_check_page(struct page *page, int quiet)
+static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned chunk_size = ext2_chunk_size(dir);
- char *kaddr = page_address(page);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
unsigned limit = PAGE_SIZE;
@@ -205,7 +204,8 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n,
if (!IS_ERR(page)) {
*page_addr = kmap_local_page(page);
if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !ext2_check_page(page, quiet))
+ if (PageError(page) || !ext2_check_page(page, quiet,
+ *page_addr))
goto fail;
}
}
@@ -584,10 +584,10 @@ out_unlock:
* ext2_delete_entry deletes a directory entry by merging it with the
* previous entry. Page is up-to-date.
*/
-int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
+int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page,
+ char *kaddr)
{
struct inode *inode = page->mapping->host;
- char *kaddr = page_address(page);
unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
unsigned to = ((char *)dir - kaddr) +
ext2_rec_len_from_disk(dir->rec_len);
@@ -607,7 +607,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
de = ext2_next_entry(de);
}
if (pde)
- from = (char*)pde - (char*)page_address(page);
+ from = (char *)pde - kaddr;
pos = page_offset(page) + from;
lock_page(page);
err = ext2_prepare_chunk(page, pos, to - from);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index b0a694820cb7..e512630cb63e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -740,7 +740,8 @@ extern int ext2_inode_by_name(struct inode *dir,
extern int ext2_make_empty(struct inode *, struct inode *);
extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
struct page **, void **res_page_addr);
-extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
+extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page,
+ char *kaddr);
extern int ext2_empty_dir (struct inode *);
extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa);
extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, void *,
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 1f69b81655b6..5f6b7560eb3f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -293,7 +293,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
goto out;
}
- err = ext2_delete_entry (de, page);
+ err = ext2_delete_entry (de, page, page_addr);
ext2_put_page(page, page_addr);
if (err)
goto out;
@@ -397,7 +397,7 @@ static int ext2_rename (struct user_namespace * mnt_userns,
old_inode->i_ctime = current_time(old_inode);
mark_inode_dirty(old_inode);
- ext2_delete_entry(old_de, old_page);
+ ext2_delete_entry(old_de, old_page, old_page_addr);
if (dir_de) {
if (old_dir != new_dir)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 06d04a74ab6c..4c3370548982 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -521,6 +521,9 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
*/
smp_mb();
+ if (IS_DAX(inode))
+ return false;
+
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 926eeb9bf4eb..cdfb1ae78a3f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -77,7 +77,7 @@ enum hugetlb_param {
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
fsparam_u32 ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
- fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
diff --git a/fs/internal.h b/fs/internal.h
index 3ce8edbaa3ca..82e8eb32ff3d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,7 +61,6 @@ extern void __init chrdev_init(void);
*/
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
-extern void fc_drop_locked(struct fs_context *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 843d4a7bcd6e..cf086b01c6c6 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -731,7 +731,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
int work_flags;
unsigned long flags;
- if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) {
+ /*
+ * If io-wq is exiting for this task, or if the request has explicitly
+ * been marked as one that should not get executed, cancel it here.
+ */
+ if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
+ (work->flags & IO_WQ_WORK_CANCEL)) {
io_run_cancel(work, wqe);
return;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0cac361bf6b8..bf548af0426c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1279,8 +1279,17 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_for_each_link(cur, req)
- io_prep_async_work(cur);
+ if (req->flags & REQ_F_LINK_TIMEOUT) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock_irq(&ctx->completion_lock);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
+ spin_unlock_irq(&ctx->completion_lock);
+ } else {
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
+ }
}
static void io_queue_async_work(struct io_kiocb *req)
@@ -1294,6 +1303,17 @@ static void io_queue_async_work(struct io_kiocb *req)
/* init ->work of the whole link before punting */
io_prep_async_link(req);
+
+ /*
+ * Not expected to happen, but if we do have a bug where this _can_
+ * happen, catch it here and ensure the request is marked as
+ * canceled. That will make io-wq go through the usual work cancel
+ * procedure rather than attempt to run this request (or create a new
+ * worker for it).
+ */
+ if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
io_wq_enqueue(tctx->io_wq, &req->work);
@@ -1939,9 +1959,13 @@ static void tctx_task_work(struct callback_head *cb)
node = next;
}
if (wq_list_empty(&tctx->task_list)) {
+ spin_lock_irq(&tctx->task_lock);
clear_bit(0, &tctx->task_state);
- if (wq_list_empty(&tctx->task_list))
+ if (wq_list_empty(&tctx->task_list)) {
+ spin_unlock_irq(&tctx->task_lock);
break;
+ }
+ spin_unlock_irq(&tctx->task_lock);
/* another tctx_task_work() is enqueued, yield */
if (test_and_set_bit(0, &tctx->task_state))
break;
@@ -2036,6 +2060,12 @@ static void io_req_task_queue(struct io_kiocb *req)
io_req_task_work_add(req);
}
+static void io_req_task_queue_reissue(struct io_kiocb *req)
+{
+ req->io_task_work.func = io_queue_async_work;
+ io_req_task_work_add(req);
+}
+
static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2205,7 +2235,7 @@ static inline bool io_run_task_work(void)
* Find and free completed poll iocbs
*/
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- struct list_head *done)
+ struct list_head *done, bool resubmit)
{
struct req_batch rb;
struct io_kiocb *req;
@@ -2220,11 +2250,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
- if (READ_ONCE(req->result) == -EAGAIN &&
+ if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
!(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
req_ref_get(req);
- io_queue_async_work(req);
+ io_req_task_queue_reissue(req);
continue;
}
@@ -2244,7 +2274,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
+ long min, bool resubmit)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
@@ -2287,7 +2317,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
if (!list_empty(&done))
- io_iopoll_complete(ctx, nr_events, &done);
+ io_iopoll_complete(ctx, nr_events, &done, resubmit);
return ret;
}
@@ -2305,7 +2335,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
while (!list_empty(&ctx->iopoll_list)) {
unsigned int nr_events = 0;
- io_do_iopoll(ctx, &nr_events, 0);
+ io_do_iopoll(ctx, &nr_events, 0, false);
/* let it sleep and repeat later if can't complete a request */
if (nr_events == 0)
@@ -2367,7 +2397,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, &nr_events, min);
+ ret = io_do_iopoll(ctx, &nr_events, min, true);
} while (!ret && nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
@@ -2417,6 +2447,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
*/
if (percpu_ref_is_dying(&ctx->refs))
return false;
+ /*
+ * Play it safe and assume not safe to re-import and reissue if we're
+ * not in the original thread group (or in task context).
+ */
+ if (!same_thread_group(req->task, current) || !in_task())
+ return false;
return true;
}
#else
@@ -2747,7 +2783,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
req_ref_get(req);
- io_queue_async_work(req);
+ io_req_task_queue_reissue(req);
} else {
int cflags = 0;
@@ -4802,6 +4838,7 @@ IO_NETOP_FN(recv);
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
+ int nr_entries;
int error;
};
@@ -4902,7 +4939,6 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
if (req->poll.events & EPOLLONESHOT)
flags = 0;
if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
- io_poll_remove_waitqs(req);
req->poll.done = true;
flags = 0;
}
@@ -4925,6 +4961,7 @@ static void io_poll_task_func(struct io_kiocb *req)
done = io_poll_complete(req, req->result);
if (done) {
+ io_poll_remove_double(req);
hash_del(&req->hash_node);
} else {
req->result = 0;
@@ -4995,11 +5032,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
struct io_kiocb *req = pt->req;
/*
- * If poll->head is already set, it's because the file being polled
- * uses multiple waitqueues for poll handling (eg one for read, one
- * for write). Setup a separate io_poll_iocb if this happens.
+ * The file being polled uses multiple waitqueues for poll handling
+ * (e.g. one for read, one for write). Setup a separate io_poll_iocb
+ * if this happens.
*/
- if (unlikely(poll->head)) {
+ if (unlikely(pt->nr_entries)) {
struct io_poll_iocb *poll_one = poll;
/* already have a 2nd entry, fail a third attempt */
@@ -5027,7 +5064,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
*poll_ptr = poll;
}
- pt->error = 0;
+ pt->nr_entries++;
poll->head = head;
if (poll->events & EPOLLEXCLUSIVE)
@@ -5104,11 +5141,16 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
ipt->pt._key = mask;
ipt->req = req;
- ipt->error = -EINVAL;
+ ipt->error = 0;
+ ipt->nr_entries = 0;
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+ if (unlikely(!ipt->nr_entries) && !ipt->error)
+ ipt->error = -EINVAL;
spin_lock_irq(&ctx->completion_lock);
+ if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
+ io_poll_remove_double(req);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
if (unlikely(list_empty(&poll->wait.entry))) {
@@ -5179,7 +5221,6 @@ static int io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
if (ret || ipt.error) {
- io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
if (ret)
return IO_APOLL_READY;
@@ -6792,7 +6833,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
mutex_lock(&ctx->uring_lock);
if (!list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, &nr_events, 0);
+ io_do_iopoll(ctx, &nr_events, 0, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
@@ -7899,15 +7940,19 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_wq_data data;
unsigned int concurrency;
+ mutex_lock(&ctx->uring_lock);
hash = ctx->hash_map;
if (!hash) {
hash = kzalloc(sizeof(*hash), GFP_KERNEL);
- if (!hash)
+ if (!hash) {
+ mutex_unlock(&ctx->uring_lock);
return ERR_PTR(-ENOMEM);
+ }
refcount_set(&hash->refs, 1);
init_waitqueue_head(&hash->wait);
ctx->hash_map = hash;
}
+ mutex_unlock(&ctx->uring_lock);
data.hash = hash;
data.task = task;
@@ -7981,9 +8026,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
f = fdget(p->wq_fd);
if (!f.file)
return -ENXIO;
- fdput(f);
- if (f.file->f_op != &io_uring_fops)
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
return -EINVAL;
+ }
+ fdput(f);
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct task_struct *tsk;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 775657943057..54d7843c0211 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
}
}
+/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ * is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u64 start_block, end_block, nr_blocks;
+ u64 p_block, offset;
+ u32 cluster, p_cluster, nr_clusters;
+ struct super_block *sb = inode->i_sb;
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+ if (start + len < end)
+ end = start + len;
+
+ start_block = ocfs2_blocks_for_bytes(sb, start);
+ end_block = ocfs2_blocks_for_bytes(sb, end);
+ nr_blocks = end_block - start_block;
+ if (!nr_blocks)
+ return 0;
+
+ cluster = ocfs2_bytes_to_clusters(sb, start);
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+ &nr_clusters, NULL);
+ if (ret)
+ return ret;
+ if (!p_cluster)
+ return 0;
+
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
static int ocfs2_zero_partial_clusters(struct inode *inode,
u64 start, u64 len)
{
@@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
unsigned int csize = osb->s_clustersize;
handle_t *handle;
+ loff_t isize = i_size_read(inode);
/*
* The "start" and "end" values are NOT necessarily part of
@@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
goto out;
+ /* No page cache for EOF blocks, issue zero out to disk. */
+ if (end > isize) {
+ /*
+ * zeroout eof blocks in last cluster starting from
+ * "isize" even "start" > "isize" because it is
+ * complicated to zeroout just at "start" as "start"
+ * may be not aligned with block size, buffer write
+ * would be required to do that, but out of eof buffer
+ * write is not supported.
+ */
+ ret = ocfs2_zeroout_partial_cluster(inode, isize,
+ end - isize);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (start >= isize)
+ goto out;
+ end = isize;
+ }
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -1856,45 +1916,6 @@ out:
}
/*
- * zero out partial blocks of one cluster.
- *
- * start: file offset where zero starts, will be made upper block aligned.
- * len: it will be trimmed to the end of current cluster if "start + len"
- * is bigger than it.
- */
-static int ocfs2_zeroout_partial_cluster(struct inode *inode,
- u64 start, u64 len)
-{
- int ret;
- u64 start_block, end_block, nr_blocks;
- u64 p_block, offset;
- u32 cluster, p_cluster, nr_clusters;
- struct super_block *sb = inode->i_sb;
- u64 end = ocfs2_align_bytes_to_clusters(sb, start);
-
- if (start + len < end)
- end = start + len;
-
- start_block = ocfs2_blocks_for_bytes(sb, start);
- end_block = ocfs2_blocks_for_bytes(sb, end);
- nr_blocks = end_block - start_block;
- if (!nr_blocks)
- return 0;
-
- cluster = ocfs2_bytes_to_clusters(sb, start);
- ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
- &nr_clusters, NULL);
- if (ret)
- return ret;
- if (!p_cluster)
- return 0;
-
- offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
- p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
- return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
-}
-
-/*
* Parts of this function taken from xfs_change_file_space()
*/
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
@@ -1935,7 +1956,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- orig_isize = i_size_read(inode);
switch (sr->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -1943,7 +1963,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
sr->l_start += f_pos;
break;
case 2: /*SEEK_END*/
- sr->l_start += orig_isize;
+ sr->l_start += i_size_read(inode);
break;
default:
ret = -EINVAL;
@@ -1998,6 +2018,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
ret = -EINVAL;
}
+ orig_isize = i_size_read(inode);
/* zeroout eof blocks in the cluster. */
if (!ret && change_size && orig_isize < size) {
ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
diff --git a/fs/pipe.c b/fs/pipe.c
index bfd946a9ad01..8e6ef62aeb1c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -32,6 +32,21 @@
#include "internal.h"
/*
+ * New pipe buffers will be restricted to this size while the user is exceeding
+ * their pipe buffer quota. The general pipe use case needs at least two
+ * buffers: one for data yet to be read, and one for new data. If this is less
+ * than two, then a write to a non-empty pipe may block even if the pipe is not
+ * full. This can occur with GNU make jobserver or similar uses of pipes as
+ * semaphores: multiple processes may be waiting to write tokens back to the
+ * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
+ *
+ * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
+ * own risk, namely: pipe writes to non-full pipes may block until the pipe is
+ * emptied.
+ */
+#define PIPE_MIN_DEF_BUFFERS 2
+
+/*
* The max size that a non-root user is allowed to grow the pipe. Can
* be set by root in /proc/sys/fs/pipe-max-size
*/
@@ -429,20 +444,20 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
#endif
/*
- * Only wake up if the pipe started out empty, since
- * otherwise there should be no readers waiting.
+ * Epoll nonsensically wants a wakeup whether the pipe
+ * was already empty or not.
*
* If it wasn't empty we try to merge new data into
* the last buffer.
*
* That naturally merges small writes, but it also
- * page-aligs the rest of the writes for large writes
+ * page-aligns the rest of the writes for large writes
* spanning multiple pages.
*/
head = pipe->head;
- was_empty = pipe_empty(head, pipe->tail);
+ was_empty = true;
chars = total_len & (PAGE_SIZE-1);
- if (chars && !was_empty) {
+ if (chars && !pipe_empty(head, pipe->tail)) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
@@ -781,8 +796,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
- user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
- pipe_bufs = 1;
+ user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
+ pipe_bufs = PIPE_MIN_DEF_BUFFERS;
}
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 476a7ff49482..ef42729216d1 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -387,6 +387,24 @@ void pathrelse(struct treepath *search_path)
search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
}
+static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
+{
+ struct reiserfs_de_head *deh;
+ int i;
+
+ deh = B_I_DEH(bh, ih);
+ for (i = 0; i < ih_entry_count(ih); i++) {
+ if (deh_location(&deh[i]) > ih_item_len(ih)) {
+ reiserfs_warning(NULL, "reiserfs-5094",
+ "directory entry location seems wrong %h",
+ &deh[i]);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
{
struct block_head *blkh;
@@ -454,11 +472,14 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
"(second one): %h", ih);
return 0;
}
- if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) {
- reiserfs_warning(NULL, "reiserfs-5093",
- "item entry count seems wrong %h",
- ih);
- return 0;
+ if (is_direntry_le_ih(ih)) {
+ if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
+ reiserfs_warning(NULL, "reiserfs-5093",
+ "item entry count seems wrong %h",
+ ih);
+ return 0;
+ }
+ return has_valid_deh_location(bh, ih);
}
prev_location = ih_location(ih);
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3ffafc73acf0..58481f8d63d5 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2082,6 +2082,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
unlock_new_inode(root_inode);
}
+ if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
+ !root_inode->i_size) {
+ SWARN(silent, s, "", "corrupt root inode, run fsck");
+ iput(root_inode);
+ errval = -EUCLEAN;
+ goto error;
+ }
+
s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto error;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index b117b212ef28..4a2cda04d3e2 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -32,6 +32,9 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
+ if (unlikely(size > MAX_RW_COUNT))
+ return NULL;
+
return kvmalloc(size, GFP_KERNEL_ACCOUNT);
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f6e0f0c0d0e5..5c2d806e6ae5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 *start, __u64 len)
+ __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- *start = untagged_addr(*start);
-
- if (*start & ~PAGE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (*start < mmap_min_addr)
+ if (start < mmap_min_addr)
return -EINVAL;
- if (*start >= task_size)
+ if (start >= task_size)
return -EINVAL;
- if (len > task_size - *start)
+ if (len > task_size - start)
return -EINVAL;
return 0;
}
@@ -1316,7 +1314,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_MINOR;
}
- ret = validate_range(mm, &uffdio_register.range.start,
+ ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1522,7 +1520,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, &uffdio_unregister.start,
+ ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1671,7 +1669,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1711,7 +1709,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1768,7 +1766,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
@@ -1818,7 +1816,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
sizeof(struct uffdio_writeprotect)))
return -EFAULT;
- ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+ ret = validate_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len);
if (ret)
return ret;
@@ -1866,7 +1864,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
sizeof(uffdio_continue) - (sizeof(__s64))))
goto out;
- ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+ ret = validate_range(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len);
if (ret)
goto out;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index d548ea4b6aab..2c5bcbc19264 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -411,7 +411,16 @@ struct xfs_log_dinode {
/* start of the extended dinode, writable fields */
uint32_t di_crc; /* CRC of the inode */
uint64_t di_changecount; /* number of attribute changes */
- xfs_lsn_t di_lsn; /* flush sequence */
+
+ /*
+ * The LSN we write to this field during formatting is not a reflection
+ * of the current on-disk LSN. It should never be used for recovery
+ * sequencing, nor should it be recovered into the on-disk inode at all.
+ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+ * for details.
+ */
+ xfs_lsn_t di_lsn;
+
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
uint8_t di_pad2[12]; /* more padding for future expansion */
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d44e8b4a3391..4775485b4062 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -698,7 +698,8 @@ xlog_recover_do_inode_buffer(
static xfs_lsn_t
xlog_recover_get_buf_lsn(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
{
uint32_t magic32;
uint16_t magic16;
@@ -706,11 +707,20 @@ xlog_recover_get_buf_lsn(
void *blk = bp->b_addr;
uuid_t *uuid;
xfs_lsn_t lsn = -1;
+ uint16_t blft;
/* v4 filesystems always recover immediately */
if (!xfs_sb_version_hascrc(&mp->m_sb))
goto recover_immediately;
+ /*
+ * realtime bitmap and summary file blocks do not have magic numbers or
+ * UUIDs, so we must recover them immediately.
+ */
+ blft = xfs_blft_from_flags(buf_f);
+ if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+ goto recover_immediately;
+
magic32 = be32_to_cpu(*(__be32 *)blk);
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
@@ -796,6 +806,7 @@ xlog_recover_get_buf_lsn(
switch (magicda) {
case XFS_DIR3_LEAF1_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
case XFS_DA3_NODE_MAGIC:
lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
@@ -919,7 +930,7 @@ xlog_recover_buf_commit_pass2(
* the verifier will be reset to match whatever recover turns that
* buffer into.
*/
- lsn = xlog_recover_get_buf_lsn(mp, bp);
+ lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
trace_xfs_log_recover_buf_skip(log, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 7b79518b6c20..e0072a6cd2d3 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts(
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
- struct xfs_dinode *to)
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
{
to->di_magic = cpu_to_be16(from->di_magic);
to->di_mode = cpu_to_be16(from->di_mode);
@@ -182,7 +183,7 @@ xfs_log_dinode_to_disk(
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
+ to->di_lsn = cpu_to_be64(lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
to->di_flushiter = 0;
@@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2(
}
/*
- * If the inode has an LSN in it, recover the inode only if it's less
- * than the lsn of the transaction we are replaying. Note: we still
- * need to replay an owner change even though the inode is more recent
- * than the transaction as there is no guarantee that all the btree
- * blocks are more recent than this transaction, too.
+ * If the inode has an LSN in it, recover the inode only if the on-disk
+ * inode's LSN is older than the lsn of the transaction we are
+ * replaying. We can have multiple checkpoints with the same start LSN,
+ * so the current LSN being equal to the on-disk LSN doesn't necessarily
+ * mean that the on-disk inode is more recent than the change being
+ * replayed.
+ *
+ * We must check the current_lsn against the on-disk inode
+ * here because the we can't trust the log dinode to contain a valid LSN
+ * (see comment below before replaying the log dinode for details).
+ *
+ * Note: we still need to replay an owner change even though the inode
+ * is more recent than the transaction as there is no guarantee that all
+ * the btree blocks are more recent than this transaction, too.
*/
if (dip->di_version >= 3) {
xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
goto out_owner_change;
@@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2(
goto out_release;
}
- /* recover the log dinode inode into the on disk inode */
- xfs_log_dinode_to_disk(ldip, dip);
+ /*
+ * Recover the log dinode inode into the on disk inode.
+ *
+ * The LSN in the log dinode is garbage - it can be zero or reflect
+ * stale in-memory runtime state that isn't coherent with the changes
+ * logged in this transaction or the changes written to the on-disk
+ * inode. Hence we write the current lSN into the inode because that
+ * matches what xfs_iflush() would write inode the inode when flushing
+ * the changes in this transaction.
+ */
+ xfs_log_dinode_to_disk(ldip, dip, current_lsn);
fields = in_f->ilf_fields;
if (fields & XFS_ILOG_DEV)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 36fa2650b081..60ac5fd63f1e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -78,13 +78,12 @@ xlog_verify_iclog(
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t tail_lsn);
+ struct xlog_in_core *iclog);
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
-#define xlog_verify_tail_lsn(a,b,c)
+#define xlog_verify_tail_lsn(a,b)
#endif
STATIC int
@@ -487,51 +486,80 @@ out_error:
return error;
}
-static bool
-__xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- lockdep_assert_held(&log->l_icloglock);
-
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
- /* update tail before writing to iclog */
- xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
- iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
- xlog_verify_tail_lsn(log, iclog, tail_lsn);
- /* cycle incremented when incrementing curr_block */
- trace_xlog_iclog_syncing(iclog, _RET_IP_);
- return true;
- }
-
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
- return false;
-}
-
/*
* Flush iclog to disk if this is the last reference to the given iclog and the
* it is in the WANT_SYNC state.
+ *
+ * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+ * does not match, there may be metadata on disk that must be persisted before
+ * this iclog is written. To satisfy that requirement, set the
+ * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+ * log tail value.
+ *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+ * within the iclog. We need to ensure that the log tail does not move beyond
+ * the tail that the first commit record in the iclog ordered against, otherwise
+ * correct recovery of that checkpoint becomes dependent on future operations
+ * performed on this iclog.
+ *
+ * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
+ * current tail into iclog. Once the iclog tail is set, future operations must
+ * not modify it, otherwise they potentially violate ordering constraints for
+ * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
+ * the iclog will get zeroed on activation of the iclog after sync, so we
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
*/
+
int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ xfs_lsn_t old_tail_lsn)
{
+ xfs_lsn_t tail_lsn;
lockdep_assert_held(&log->l_icloglock);
trace_xlog_iclog_release(iclog, _RET_IP_);
if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
- if (atomic_dec_and_test(&iclog->ic_refcnt) &&
- __xlog_state_release_iclog(log, iclog)) {
- spin_unlock(&log->l_icloglock);
- xlog_sync(log, iclog);
- spin_lock(&log->l_icloglock);
+ /*
+ * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ * of the tail LSN into the iclog so we guarantee that the log tail does
+ * not move between deciding if a cache flush is required and writing
+ * the LSN into the iclog below.
+ */
+ if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+ if (old_tail_lsn && tail_lsn != old_tail_lsn)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+
+ if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+ !iclog->ic_header.h_tail_lsn)
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
}
+ if (!atomic_dec_and_test(&iclog->ic_refcnt))
+ return 0;
+
+ if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return 0;
+ }
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ if (!iclog->ic_header.h_tail_lsn)
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog);
+ trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog);
+ spin_lock(&log->l_icloglock);
return 0;
}
@@ -774,6 +802,21 @@ xfs_log_mount_cancel(
}
/*
+ * Flush out the iclog to disk ensuring that device caches are flushed and
+ * the iclog hits stable storage before any completion waiters are woken.
+ */
+static inline int
+xlog_force_iclog(
+ struct xlog_in_core *iclog)
+{
+ atomic_inc(&iclog->ic_refcnt);
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+ return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+}
+
+/*
* Wait for the iclog and all prior iclogs to be written disk as required by the
* log force state machine. Waiting on ic_force_wait ensures iclog completions
* have been ordered and callbacks run before we are woken here, hence
@@ -827,13 +870,6 @@ xlog_write_unmount_record(
/* account for space used by record data */
ticket->t_curr_res -= sizeof(ulf);
- /*
- * For external log devices, we need to flush the data device cache
- * first to ensure all metadata writeback is on stable storage before we
- * stamp the tail LSN into the unmount record.
- */
- if (log->l_targ != log->l_mp->m_ddev_targp)
- blkdev_issue_flush(log->l_targ->bt_bdev);
return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS);
}
@@ -865,18 +901,7 @@ out_err:
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR);
- /*
- * Ensure the journal is fully flushed and on stable storage once the
- * iclog containing the unmount record is written.
- */
- iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_force_iclog(iclog);
xlog_wait_on_iclog(iclog);
if (tic) {
@@ -1796,10 +1821,20 @@ xlog_write_iclog(
* metadata writeback and causing priority inversions.
*/
iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
- if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH)
+ if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+ /*
+ * For external log devices, we also need to flush the data
+ * device cache first to ensure all metadata writeback covered
+ * by the LSN in this iclog is on stable storage. This is slow,
+ * but it *must* complete before we issue the external log IO.
+ */
+ if (log->l_targ != log->l_mp->m_ddev_targp)
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
+ }
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
+
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
@@ -2310,7 +2345,7 @@ xlog_write_copy_finish(
return 0;
release_iclog:
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
return error;
}
@@ -2529,7 +2564,7 @@ next_lv:
ASSERT(optype & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
} else {
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
}
spin_unlock(&log->l_icloglock);
@@ -2567,6 +2602,7 @@ xlog_state_activate_iclog(
memset(iclog->ic_header.h_cycle_data, 0,
sizeof(iclog->ic_header.h_cycle_data));
iclog->ic_header.h_lsn = 0;
+ iclog->ic_header.h_tail_lsn = 0;
}
/*
@@ -2967,7 +3003,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3132,6 +3168,35 @@ xlog_state_switch_iclogs(
}
/*
+ * Force the iclog to disk and check if the iclog has been completed before
+ * xlog_force_iclog() returns. This can happen on synchronous (e.g.
+ * pmem) or fast async storage because we drop the icloglock to issue the IO.
+ * If completion has already occurred, tell the caller so that it can avoid an
+ * unnecessary wait on the iclog.
+ */
+static int
+xlog_force_and_check_iclog(
+ struct xlog_in_core *iclog,
+ bool *completed)
+{
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ int error;
+
+ *completed = false;
+ error = xlog_force_iclog(iclog);
+ if (error)
+ return error;
+
+ /*
+ * If the iclog has already been completed and reused the header LSN
+ * will have been rewritten by completion
+ */
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ *completed = true;
+ return 0;
+}
+
+/*
* Write out all data in the in-core log as of this exact moment in time.
*
* Data may be written to the in-core log during this call. However,
@@ -3165,7 +3230,6 @@ xfs_log_force(
{
struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
- xfs_lsn_t lsn;
XFS_STATS_INC(mp, xs_log_force);
trace_xfs_log_force(mp, 0, _RET_IP_);
@@ -3193,39 +3257,33 @@ xfs_log_force(
iclog = iclog->ic_prev;
} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
if (atomic_read(&iclog->ic_refcnt) == 0) {
- /*
- * We are the only one with access to this iclog.
- *
- * Flush it out now. There should be a roundoff of zero
- * to show that someone has already taken care of the
- * roundoff from the previous sync.
- */
- atomic_inc(&iclog->ic_refcnt);
- lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ /* We have exclusive access to this iclog. */
+ bool completed;
+
+ if (xlog_force_and_check_iclog(iclog, &completed))
goto out_error;
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ if (completed)
goto out_unlock;
} else {
/*
- * Someone else is writing to this iclog.
- *
- * Use its call to flush out the data. However, the
- * other thread may not force out this LR, so we mark
- * it WANT_SYNC.
+ * Someone else is still writing to this iclog, so we
+ * need to ensure that when they release the iclog it
+ * gets synced immediately as we may be waiting on it.
*/
xlog_state_switch_iclogs(log, iclog, 0);
}
- } else {
- /*
- * If the head iclog is not active nor dirty, we just attach
- * ourselves to the head and go to sleep if necessary.
- */
- ;
}
+ /*
+ * The iclog we are about to wait on may contain the checkpoint pushed
+ * by the above xlog_cil_force() call, but it may not have been pushed
+ * to disk yet. Like the ACTIVE case above, we need to make sure caches
+ * are flushed when this iclog is written.
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+
if (flags & XFS_LOG_SYNC)
return xlog_wait_on_iclog(iclog);
out_unlock:
@@ -3245,6 +3303,7 @@ xlog_force_lsn(
bool already_slept)
{
struct xlog_in_core *iclog;
+ bool completed;
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
@@ -3258,7 +3317,8 @@ xlog_force_lsn(
goto out_unlock;
}
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
/*
* We sleep here if we haven't already slept (e.g. this is the
* first time we've looked at the correct iclog buf) and the
@@ -3281,12 +3341,31 @@ xlog_force_lsn(
&log->l_icloglock);
return -EAGAIN;
}
- atomic_inc(&iclog->ic_refcnt);
- xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_force_and_check_iclog(iclog, &completed))
goto out_error;
if (log_flushed)
*log_flushed = 1;
+ if (completed)
+ goto out_unlock;
+ break;
+ case XLOG_STATE_WANT_SYNC:
+ /*
+ * This iclog may contain the checkpoint pushed by the
+ * xlog_cil_force_seq() call, but there are other writers still
+ * accessing it so it hasn't been pushed to disk yet. Like the
+ * ACTIVE case above, we need to make sure caches are flushed
+ * when this iclog is written.
+ */
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ break;
+ default:
+ /*
+ * The entire checkpoint was written by the CIL force and is on
+ * its way to disk already. It will be stable when it
+ * completes, so we don't need to manipulate caches here at all.
+ * We just need to wait for completion if necessary.
+ */
+ break;
}
if (flags & XFS_LOG_SYNC)
@@ -3559,10 +3638,10 @@ xlog_verify_grant_tail(
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t tail_lsn)
+ struct xlog_in_core *iclog)
{
- int blocks;
+ xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+ int blocks;
if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
blocks =
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b128aaa9b870..4c44bc3786c0 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -654,8 +654,9 @@ xlog_cil_push_work(
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
+ xfs_lsn_t preflush_tail_lsn;
xfs_lsn_t commit_lsn;
- xfs_lsn_t push_seq;
+ xfs_csn_t push_seq;
struct bio bio;
DECLARE_COMPLETION_ONSTACK(bdev_flush);
@@ -730,7 +731,15 @@ xlog_cil_push_work(
* because we hold the flush lock exclusively. Hence we can now issue
* a cache flush to ensure all the completed metadata in the journal we
* are about to overwrite is on stable storage.
+ *
+ * Because we are issuing this cache flush before we've written the
+ * tail lsn to the iclog, we can have metadata IO completions move the
+ * tail forwards between the completion of this flush and the iclog
+ * being written. In this case, we need to re-issue the cache flush
+ * before the iclog write. To detect whether the log tail moves, sample
+ * the tail LSN *before* we issue the flush.
*/
+ preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
&bdev_flush);
@@ -941,7 +950,7 @@ restart:
* storage.
*/
commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
- xlog_state_release_iclog(log, commit_iclog);
+ xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
spin_unlock(&log->l_icloglock);
return;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 4c41bbfa33b0..f3e79a45d60a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -59,6 +59,16 @@ enum xlog_iclog_state {
{ XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \
{ XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" }
+/*
+ * In core log flags
+ */
+#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
+
+#define XLOG_ICL_STRINGS \
+ { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
+ { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" }
+
/*
* Log ticket flags
@@ -143,9 +153,6 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
-
/* Ticket reservation region accounting */
#define XLOG_TIC_LEN_MAX 15
@@ -497,7 +504,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+ xfs_lsn_t log_tail_lsn);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f9d8d605f9b1..19260291ff8b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3944,6 +3944,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
__field(uint32_t, state)
__field(int32_t, refcount)
__field(uint32_t, offset)
+ __field(uint32_t, flags)
__field(unsigned long long, lsn)
__field(unsigned long, caller_ip)
),
@@ -3952,15 +3953,17 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
__entry->state = iclog->ic_state;
__entry->refcount = atomic_read(&iclog->ic_refcnt);
__entry->offset = iclog->ic_offset;
+ __entry->flags = iclog->ic_flags;
__entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS",
+ TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->state, XLOG_STATE_STRINGS),
__entry->refcount,
__entry->offset,
__entry->lsn,
+ __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS),
(char *)__entry->caller_ip)
);